src/util/string.c (view raw)
1/* Copyright (c) 2013-2015 Jeffrey Pfau
2 *
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6#include "util/string.h"
7
8#include <string.h>
9
10#ifndef HAVE_STRNDUP
// Fallback strndup for platforms that lack it.
// Returns a newly malloc'd, NUL-terminated copy of at most len bytes of
// start, stopping early at the first NUL. Returns a null pointer if the
// allocation fails. The caller owns the result and releases it with free().
char* strndup(const char* start, size_t len) {
	// Measure the bytes actually present so short strings don't over-allocate
	size_t copyLen = 0;
	while (copyLen < len && start[copyLen] != '\0') {
		++copyLen;
	}
	char* out = malloc(copyLen + 1);
	if (!out) {
		return 0;
	}
	memcpy(out, start, copyLen);
	out[copyLen] = '\0';
	return out;
}
18#endif
19
20#ifndef HAVE_STRDUP
// Fallback strdup for platforms that lack it.
// Returns a newly malloc'd copy of the NUL-terminated string str, or a null
// pointer if the allocation fails. The caller releases the result with free().
char* strdup(const char* str) {
	// Size includes the terminator so a single memcpy copies it as well
	size_t size = strlen(str) + 1;
	char* out = malloc(size);
	if (!out) {
		return 0;
	}
	memcpy(out, str, size);
	return out;
}
28#endif
29
// Finds the last occurrence of the NUL-terminated string needle that fits
// entirely within the first len bytes of haystack.
// Returns a pointer to the start of that occurrence, or a null pointer if
// there is none. An empty needle matches at every position, so its last
// match is the end of the window (haystack + len).
char* strnrstr(const char* restrict haystack, const char* restrict needle, size_t len) {
	size_t needleLen = strlen(needle);
	if (needleLen == 0) {
		// Handled up front: an unsigned "len >= 0" loop bound would never become false
		return (char*) haystack + len;
	}
	if (needleLen > len) {
		return 0;
	}
	// Try candidate start offsets from the back; the first hit is the last match
	size_t start = len - needleLen + 1;
	while (start-- > 0) {
		if (strncmp(needle, haystack + start, needleLen) == 0) {
			return (char*) haystack + start;
		}
	}
	return 0;
}
41
// Decodes one code point from a UTF-16 buffer and advances the cursor.
// *unicode points at the next 16-bit unit and *length is the number of BYTES
// remaining; both are updated to reflect the units consumed.
// Returns the decoded code point, or 0 when the input is truncated (in which
// case *length is forced to 0) or the surrogate pairing is invalid.
uint32_t utf16Char(const uint16_t** unicode, size_t* length) {
	const size_t unitBytes = 2;
	if (*length < unitBytes) {
		*length = 0;
		return 0;
	}
	uint32_t first = **unicode;
	*unicode += 1;
	*length -= unitBytes;
	if (first < 0xD800 || first >= 0xE000) {
		// Not a surrogate: the unit is the code point itself
		return first;
	}

	if (*length < unitBytes) {
		*length = 0;
		return 0;
	}
	uint32_t second = **unicode;
	*unicode += 1;
	*length -= unitBytes;

	// The second unit is consumed before either half is validated, so a
	// malformed pair still advances the cursor by two units.
	if (first >= 0xDC00) {
		return 0;
	}
	if (second < 0xDC00 || second >= 0xE000) {
		return 0;
	}
	return ((first - 0xD800) << 10) + (second - 0xDC00) + 0x10000;
}
71
// Decodes one code point from a UTF-8 buffer and advances the cursor.
// *unicode points at the next byte and *length is the number of bytes
// remaining; both are updated to reflect the bytes consumed.
// Returns the decoded code point. Returns 0 if the buffer is empty, the lead
// byte is not a valid sequence start, a continuation byte is malformed, or
// the sequence is truncated (in which case *length is forced to 0).
uint32_t utf8Char(const char** unicode, size_t* length) {
	if (*length == 0) {
		return 0;
	}
	// Bytes are read as unsigned: where plain char is signed, a lead byte
	// >= 0x80 would be sign-extended and leak high bits into the result.
	uint8_t byte = (uint8_t) **unicode;
	--*length;
	++*unicode;
	if (!(byte & 0x80)) {
		return byte;
	}
	// tops[n] is the lead-byte prefix of an (n + 2)-byte sequence, and
	// tops[n + 1] is the mask that isolates that prefix.
	static const int tops[4] = { 0xC0, 0xE0, 0xF0, 0xF8 };
	size_t numBytes;
	for (numBytes = 0; numBytes < 3; ++numBytes) {
		if ((byte & tops[numBytes + 1]) == tops[numBytes]) {
			break;
		}
	}
	if (numBytes == 3) {
		// Stray continuation byte or a lead byte of 0xF8 and above
		return 0;
	}
	uint32_t unichar = byte & ~tops[numBytes];
	// numBytes now counts the continuation bytes that must follow
	++numBytes;
	if (*length < numBytes) {
		*length = 0;
		return 0;
	}
	size_t i;
	for (i = 0; i < numBytes; ++i) {
		unichar <<= 6;
		byte = (uint8_t) **unicode;
		--*length;
		++*unicode;
		if ((byte & 0xC0) != 0x80) {
			return 0;
		}
		unichar |= byte & 0x3F;
	}
	return unichar;
}
112
// Encodes a single code point as UTF-8 into buffer, which must have room for
// up to four bytes. No terminator is written.
// Values above 0x10FFFF are replaced with U+FFFD before encoding.
// Returns the number of bytes written (1 to 4).
static size_t _toUtf8(uint32_t unichar, char* buffer) {
	if (unichar > 0x10FFFF) {
		unichar = 0xFFFD;
	}
	if (unichar < 0x80) {
		buffer[0] = unichar;
		return 1;
	}

	// Pick the sequence length and the matching lead-byte prefix
	size_t total;
	uint32_t lead;
	if (unichar < 0x800) {
		total = 2;
		lead = 0xC0;
	} else if (unichar < 0x10000) {
		total = 3;
		lead = 0xE0;
	} else {
		// After the clamp above everything left fits in four bytes
		total = 4;
		lead = 0xF0;
	}

	// Fill continuation bytes from the end, six payload bits at a time
	size_t pos;
	for (pos = total - 1; pos > 0; --pos) {
		buffer[pos] = (unichar & 0x3F) | 0x80;
		unichar >>= 6;
	}
	buffer[0] = unichar | lead;
	return total;
}
143
// Compares a UTF-16 string against a UTF-8 string code point by code point.
// utf16Length and utf8Length are the byte lengths of the respective buffers.
// Returns -1 if the UTF-16 string orders first, 1 if the UTF-8 string orders
// first, and 0 if they are equal. When one string is a prefix of the other,
// the shorter one orders first.
int utfcmp(const uint16_t* utf16, const char* utf8, size_t utf16Length, size_t utf8Length) {
	while (utf16Length > 0 && utf8Length > 0) {
		// Decode first, then compare, so the final pair of code points is
		// checked before the loop condition can end the walk.
		uint32_t char1 = utf16Char(&utf16, &utf16Length);
		uint32_t char2 = utf8Char(&utf8, &utf8Length);
		if (char1 < char2) {
			return -1;
		}
		if (char1 > char2) {
			return 1;
		}
	}
	if (utf16Length == 0 && utf8Length > 0) {
		return -1;
	}
	if (utf16Length > 0 && utf8Length == 0) {
		return 1;
	}
	return 0;
}
164
// Converts a UTF-16 buffer of length BYTES into a newly malloc'd,
// NUL-terminated UTF-8 string. Each code point is decoded with utf16Char and
// encoded with _toUtf8, so sequences that utf16Char reports as 0 are emitted
// as a single zero byte.
// Returns a null pointer if memory cannot be allocated. The caller releases
// the result with free().
char* utf16to8(const uint16_t* utf16, size_t length) {
	// Worst case is three output bytes per 16-bit unit: a lone unit encodes
	// to at most three bytes and a surrogate pair (two units) to four. A
	// trailing odd byte still produces one output byte, so round up.
	size_t maxUnits = length / 2 + (length & 1);
	if (maxUnits > (SIZE_MAX - 1) / 3) {
		return 0;
	}
	char* utf8 = malloc(maxUnits * 3 + 1);
	if (!utf8) {
		return 0;
	}

	size_t utf8Length = 0;
	while (length > 0) {
		uint32_t unichar = utf16Char(&utf16, &length);
		utf8Length += _toUtf8(unichar, utf8 + utf8Length);
	}
	utf8[utf8Length] = '\0';

	// Trim to the exact size; if shrinking fails the original block is still valid
	char* trimmed = realloc(utf8, utf8Length + 1);
	if (!trimmed) {
		return utf8;
	}
	return trimmed;
}
211
// Converts one hexadecimal digit character to its numeric value.
// Accepts '0'-'9', 'a'-'f' and 'A'-'F'; returns 0-15 for those and -1 for
// any other character.
int hexDigit(char digit) {
	if (digit >= '0' && digit <= '9') {
		return digit - '0';
	}
	if (digit >= 'a' && digit <= 'f') {
		return digit - 'a' + 10;
	}
	if (digit >= 'A' && digit <= 'F') {
		return digit - 'A' + 10;
	}
	return -1;
}
246
// Parses exactly eight hexadecimal digits starting at line into a 32-bit value.
// On success stores the value in *out and returns a pointer just past the
// last digit. If any of the eight characters is not a hex digit, returns a
// null pointer and leaves *out untouched.
const char* hex32(const char* line, uint32_t* out) {
	uint32_t accum = 0;
	int remaining = 8;
	while (remaining-- > 0) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
262
// Parses exactly six hexadecimal digits starting at line into the low 24 bits
// of a 32-bit value.
// On success stores the value in *out and returns a pointer just past the
// last digit. If any of the six characters is not a hex digit, returns a
// null pointer and leaves *out untouched.
const char* hex24(const char* line, uint32_t* out) {
	uint32_t accum = 0;
	int remaining = 6;
	while (remaining-- > 0) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
278
// Parses exactly four hexadecimal digits starting at line into a 16-bit value.
// *out is cleared first, so it reads 0 if parsing fails.
// On success stores the value in *out and returns a pointer just past the
// last digit; on a non-hex character returns a null pointer.
const char* hex16(const char* line, uint16_t* out) {
	*out = 0;
	uint16_t accum = 0;
	int remaining = 4;
	while (remaining-- > 0) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accum = (uint16_t) ((accum << 4) | nybble);
		++line;
	}
	*out = accum;
	return line;
}
295
// Parses exactly three hexadecimal digits starting at line into the low 12
// bits of a 16-bit value.
// *out is cleared first, so it reads 0 if parsing fails.
// On success stores the value in *out and returns a pointer just past the
// last digit; on a non-hex character returns a null pointer.
const char* hex12(const char* line, uint16_t* out) {
	*out = 0;
	uint16_t accum = 0;
	int remaining = 3;
	while (remaining-- > 0) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accum = (uint16_t) ((accum << 4) | nybble);
		++line;
	}
	*out = accum;
	return line;
}
312
// Parses exactly two hexadecimal digits starting at line into an 8-bit value.
// *out is cleared first, so it reads 0 if parsing fails.
// On success stores the value in *out and returns a pointer just past the
// last digit; on a non-hex character returns a null pointer.
const char* hex8(const char* line, uint8_t* out) {
	*out = 0;
	uint8_t accum = 0;
	int remaining = 2;
	while (remaining-- > 0) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accum = (uint8_t) ((accum << 4) | nybble);
		++line;
	}
	*out = accum;
	return line;
}
329
// Parses a single hexadecimal digit at line into the low four bits of *out.
// *out is cleared first, so it reads 0 if parsing fails.
// On success stores the value in *out and returns a pointer just past the
// digit, matching the other hexN parsers in this file; on a non-hex
// character returns a null pointer.
const char* hex4(const char* line, uint8_t* out) {
	*out = 0;
	int nybble = hexDigit(*line);
	if (nybble < 0) {
		return 0;
	}
	*out = (uint8_t) nybble;
	// Advance past the digit just consumed so callers can keep parsing from
	// the returned pointer.
	return line + 1;
}
342}