1 /*	$OpenBSD: citrus_utf8.c,v 1.6 2012/12/05 23:19:59 deraadt Exp $ */
2 
3 /*-
4  * Copyright (c) 2002-2004 Tim J. Robbins
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <errno.h>
30 #include <string.h>
31 #include <sys/param.h>
32 #include <uchar.h>
33 #include <wchar.h>
34 
35 #include "private/bionic_mbstate.h"
36 
37 //
38 // This file is basically OpenBSD's citrus_utf8.c but rewritten to not require a
39 // 12-byte mbstate_t so we're backwards-compatible with our LP32 ABI where
40 // mbstate_t was only 4 bytes.
41 //
// The state is the UTF-8 sequence. We only support sequences of at most 4
// bytes, so the LP32 mbstate_t already has enough space (of the 4 available
// bytes we only need 3, since we should never need to store the entire
// sequence in the intermediate state).
46 //
47 // The C standard leaves the conversion state undefined after a bad conversion.
48 // To avoid unexpected failures due to the possible use of the internal private
49 // state we always reset the conversion state when encountering illegal
50 // sequences.
51 //
52 // We also implement the POSIX interface directly rather than being accessed via
53 // function pointers.
54 //
55 
mbsinit(const mbstate_t * ps)56 int mbsinit(const mbstate_t* ps) {
57   return ps == nullptr || mbstate_is_initial(ps);
58 }
59 
// Converts the multibyte character starting at `s` (examining at most `n`
// bytes) into a wide character stored through `pwc`.
//
// A null `ps` selects a conversion state private to this function. All of the
// real work is delegated to mbrtoc32(), whose result is returned unchanged.
size_t mbrtowc(wchar_t* pwc, const char* s, size_t n, mbstate_t* ps) {
  static mbstate_t internal_state;
  if (ps == nullptr) {
    ps = &internal_state;
  }

  // wchar_t holds UTF-32 here, so the output pointer can be handed to the
  // char32_t implementation directly.
  char32_t* out = reinterpret_cast<char32_t*>(pwc);
  return mbrtoc32(out, s, n, ps);
}
67 
mbsnrtowcs(wchar_t * dst,const char ** src,size_t nmc,size_t len,mbstate_t * ps)68 size_t mbsnrtowcs(wchar_t* dst, const char** src, size_t nmc, size_t len, mbstate_t* ps) {
69   static mbstate_t __private_state;
70   mbstate_t* state = (ps == nullptr) ? &__private_state : ps;
71   size_t i, o, r;
72 
73   // The fast paths in the loops below are not safe if an ASCII
74   // character appears as anything but the first byte of a
75   // multibyte sequence. Check now to avoid doing it in the loops.
76   if (nmc > 0 && mbstate_bytes_so_far(state) > 0 && static_cast<uint8_t>((*src)[0]) < 0x80) {
77     return mbstate_reset_and_return_illegal(EILSEQ, state);
78   }
79 
80   // Measure only?
81   if (dst == nullptr) {
82     for (i = o = 0; i < nmc; i += r, o++) {
83       if (static_cast<uint8_t>((*src)[i]) < 0x80) {
84         // Fast path for plain ASCII characters.
85         if ((*src)[i] == '\0') {
86           return mbstate_reset_and_return(o, state);
87         }
88         r = 1;
89       } else {
90         r = mbrtowc(nullptr, *src + i, nmc - i, state);
91         if (r == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) {
92           return mbstate_reset_and_return_illegal(EILSEQ, state);
93         }
94         if (r == BIONIC_MULTIBYTE_RESULT_INCOMPLETE_SEQUENCE) {
95           return mbstate_reset_and_return_illegal(EILSEQ, state);
96         }
97         if (r == 0) {
98           return mbstate_reset_and_return(o, state);
99         }
100       }
101     }
102     return mbstate_reset_and_return(o, state);
103   }
104 
105   // Actually convert, updating `dst` and `src`.
106   for (i = o = 0; i < nmc && o < len; i += r, o++) {
107     if (static_cast<uint8_t>((*src)[i]) < 0x80) {
108       // Fast path for plain ASCII characters.
109       dst[o] = (*src)[i];
110       r = 1;
111       if ((*src)[i] == '\0') {
112         *src = nullptr;
113         return mbstate_reset_and_return(o, state);
114       }
115     } else {
116       r = mbrtowc(dst + o, *src + i, nmc - i, state);
117       if (r == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) {
118         *src += i;
119         return mbstate_reset_and_return_illegal(EILSEQ, state);
120       }
121       if (r == BIONIC_MULTIBYTE_RESULT_INCOMPLETE_SEQUENCE) {
122         *src += nmc;
123         return mbstate_reset_and_return_illegal(EILSEQ, state);
124       }
125       if (r == 0) {
126         *src = nullptr;
127         return mbstate_reset_and_return(o, state);
128       }
129     }
130   }
131   *src += i;
132   return mbstate_reset_and_return(o, state);
133 }
134 
mbsrtowcs(wchar_t * dst,const char ** src,size_t len,mbstate_t * ps)135 size_t mbsrtowcs(wchar_t* dst, const char** src, size_t len, mbstate_t* ps) {
136   return mbsnrtowcs(dst, src, SIZE_MAX, len, ps);
137 }
138 __strong_alias(mbsrtowcs_l, mbsrtowcs);
139 
// Converts the wide character `wc` into its multibyte form, stored at `s`.
//
// A null `ps` selects a conversion state private to this function. The
// conversion itself is delegated to c32rtomb(), whose result is returned
// unchanged.
size_t wcrtomb(char* s, wchar_t wc, mbstate_t* ps) {
  static mbstate_t internal_state;
  if (ps == nullptr) {
    ps = &internal_state;
  }

  // wchar_t holds UTF-32 here, so the value maps directly onto char32_t.
  const char32_t c32 = static_cast<char32_t>(wc);
  return c32rtomb(s, c32, ps);
}
147 
wcsnrtombs(char * dst,const wchar_t ** src,size_t nwc,size_t len,mbstate_t * ps)148 size_t wcsnrtombs(char* dst, const wchar_t** src, size_t nwc, size_t len, mbstate_t* ps) {
149   static mbstate_t __private_state;
150   mbstate_t* state = (ps == nullptr) ? &__private_state : ps;
151 
152   if (!mbstate_is_initial(state)) {
153     return mbstate_reset_and_return_illegal(EILSEQ, state);
154   }
155 
156   char buf[MB_LEN_MAX];
157   size_t i, o, r;
158   if (dst == nullptr) {
159     for (i = o = 0; i < nwc; i++, o += r) {
160       wchar_t wc = (*src)[i];
161       if (static_cast<uint32_t>(wc) < 0x80) {
162         // Fast path for plain ASCII characters.
163         if (wc == 0) {
164           return o;
165         }
166         r = 1;
167       } else {
168         r = wcrtomb(buf, wc, state);
169         if (r == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) {
170           return r;
171         }
172       }
173     }
174     return o;
175   }
176 
177   for (i = o = 0; i < nwc && o < len; i++, o += r) {
178     wchar_t wc = (*src)[i];
179     if (static_cast<uint32_t>(wc) < 0x80) {
180       // Fast path for plain ASCII characters.
181       dst[o] = wc;
182       if (wc == 0) {
183         *src = nullptr;
184         return o;
185       }
186       r = 1;
187     } else if (len - o >= sizeof(buf)) {
188       // Enough space to translate in-place.
189       r = wcrtomb(dst + o, wc, state);
190       if (r == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) {
191         *src += i;
192         return r;
193       }
194     } else {
195       // May not be enough space; use temp buffer.
196       r = wcrtomb(buf, wc, state);
197       if (r == BIONIC_MULTIBYTE_RESULT_ILLEGAL_SEQUENCE) {
198         *src += i;
199         return r;
200       }
201       if (r > len - o) {
202         break;
203       }
204       memcpy(dst + o, buf, r);
205     }
206   }
207   *src += i;
208   return o;
209 }
210 
wcsrtombs(char * dst,const wchar_t ** src,size_t len,mbstate_t * ps)211 size_t wcsrtombs(char* dst, const wchar_t** src, size_t len, mbstate_t* ps) {
212   return wcsnrtombs(dst, src, SIZE_MAX, len, ps);
213 }
214 __strong_alias(wcsrtombs_l, wcsrtombs);
215