src/util/string.c (view raw)
1/* Copyright (c) 2013-2015 Jeffrey Pfau
2 *
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6#include "util/string.h"
7
8#include <string.h>
9
#ifndef HAVE_STRNDUP
/* Fallback strndup for C libraries that do not provide one.
 *
 * Copies at most len bytes of start into a newly malloc'd, NUL-terminated
 * buffer, stopping early at the first NUL byte found in start.
 *
 * Returns the new string, which the caller must free, or NULL if the
 * allocation fails. */
char* strndup(const char* start, size_t len) {
	// Measure the bytes that will actually be copied first. This keeps the
	// allocation tight for short strings and avoids computing len + 1, which
	// wraps around to 0 when len is SIZE_MAX.
	size_t copyLen = 0;
	while (copyLen < len && start[copyLen] != '\0') {
		++copyLen;
	}

	char* out = malloc(copyLen + 1);
	if (!out) {
		return NULL;
	}
	memcpy(out, start, copyLen);
	out[copyLen] = '\0';
	return out;
}
#endif
19
#ifndef HAVE_STRDUP
/* Fallback strdup for C libraries that do not provide one.
 *
 * Returns a newly malloc'd copy of the NUL-terminated string str, which the
 * caller must free, or NULL if the allocation fails. */
char* strdup(const char* str) {
	// Size includes the terminator so a single memcpy copies it as well
	size_t size = strlen(str) + 1;
	char* out = malloc(size);
	if (!out) {
		return NULL;
	}
	memcpy(out, str, size);
	return out;
}
#endif
29
/* Finds the last occurrence of needle that lies entirely within the first
 * len bytes of haystack.
 *
 * Candidates are compared with strncmp, so a comparison also stops at a NUL
 * byte. An empty needle matches at every position; the last such position is
 * the end of the searched range, so haystack + len is returned in that case.
 *
 * Returns a pointer to the start of the last match, or NULL if there is none
 * (including when needle is longer than len). */
char* strnrstr(const char* restrict haystack, const char* restrict needle, size_t len) {
	size_t needleLen = strlen(needle);
	if (needleLen == 0) {
		// Without this guard "len >= needleLen" can never become false for
		// an unsigned len: the counter wraps and the scan runs off the buffer
		return (char*) haystack + len;
	}

	char* last = NULL;
	const char* next = haystack;
	// len counts the bytes still available starting at next
	for (; len >= needleLen; --len, ++next) {
		if (strncmp(needle, next, needleLen) == 0) {
			last = (char*) next;
		}
	}
	return last;
}
41
/* Decodes one code point from a UTF-16 buffer and advances past it.
 *
 * unicode points at the read cursor and length holds the number of BYTES
 * (not code units) remaining; both are updated for every unit consumed.
 *
 * Returns the decoded code point, or 0 when:
 *  - fewer than two bytes remain (length is then forced to 0),
 *  - a surrogate is the last unit in the buffer (length forced to 0),
 *  - the surrogate pair is malformed. In that case both units have already
 *    been consumed, including when the first unit was a low surrogate. */
uint32_t utf16Char(const uint16_t** unicode, size_t* length) {
	if (*length < 2) {
		*length = 0;
		return 0;
	}
	const uint16_t lead = *(*unicode)++;
	*length -= 2;

	// Anything outside the surrogate range is a complete code point
	const int leadIsSurrogate = lead >= 0xD800 && lead < 0xE000;
	if (!leadIsSurrogate) {
		return lead;
	}

	if (*length < 2) {
		*length = 0;
		return 0;
	}
	const uint16_t trail = *(*unicode)++;
	*length -= 2;

	// A well-formed pair is a high surrogate followed by a low surrogate
	const int leadIsHigh = lead < 0xDC00;
	const int trailIsLow = trail >= 0xDC00 && trail < 0xE000;
	if (!leadIsHigh || !trailIsLow) {
		return 0;
	}

	const uint32_t upperBits = (uint32_t) (lead - 0xD800) << 10;
	const uint32_t lowerBits = (uint32_t) (trail - 0xDC00);
	return 0x10000 + upperBits + lowerBits;
}
71
/* Decodes one code point from a UTF-8 buffer and advances past it.
 *
 * unicode points at the read cursor and length holds the number of bytes
 * remaining; both are updated for every byte consumed.
 *
 * Returns the decoded code point, or 0 when:
 *  - the buffer is empty,
 *  - the lead byte is not a valid 2-, 3- or 4-byte sequence start,
 *  - the sequence is truncated (length is then forced to 0),
 *  - a continuation byte is malformed (that byte has been consumed). */
uint32_t utf8Char(const char** unicode, size_t* length) {
	if (*length == 0) {
		return 0;
	}
	// Work on an unsigned byte. With a plain char that happens to be signed,
	// bytes >= 0x80 sign-extend when promoted, and masking with ~tops[n]
	// would leave the high bits set in the decoded value.
	uint8_t byte = (uint8_t) **unicode;
	--*length;
	++*unicode;
	if (!(byte & 0x80)) {
		return byte;
	}

	// tops[n] is the lead-byte prefix of an (n + 2)-byte sequence and
	// tops[n + 1] is the mask that isolates that prefix
	static const int tops[4] = { 0xC0, 0xE0, 0xF0, 0xF8 };
	size_t numBytes;
	for (numBytes = 0; numBytes < 3; ++numBytes) {
		if ((byte & tops[numBytes + 1]) == tops[numBytes]) {
			break;
		}
	}
	if (numBytes == 3) {
		// Stray continuation byte or a lead byte of 0xF8 and above
		return 0;
	}
	uint32_t unichar = byte & ~tops[numBytes];

	// numBytes now becomes the count of continuation bytes that must follow
	++numBytes;
	if (*length < numBytes) {
		*length = 0;
		return 0;
	}
	size_t i;
	for (i = 0; i < numBytes; ++i) {
		unichar <<= 6;
		byte = (uint8_t) **unicode;
		--*length;
		++*unicode;
		if ((byte & 0xC0) != 0x80) {
			return 0;
		}
		unichar |= byte & 0x3F;
	}
	return unichar;
}
112
/* Encodes a single code point as UTF-8.
 *
 * Values above 0x10FFFF are replaced with U+FFFD before encoding. Between
 * one and four bytes are written to buffer, which must have room for four;
 * no terminator is written.
 *
 * Returns the number of bytes written. */
static size_t _toUtf8(uint32_t unichar, char* buffer) {
	// Lead-byte prefix for each sequence length, indexed by that length
	static const unsigned char leadPrefix[5] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };

	uint32_t bits = unichar > 0x10FFFF ? 0xFFFD : unichar;

	size_t count;
	if (bits < 0x80) {
		count = 1;
	} else if (bits < 0x800) {
		count = 2;
	} else if (bits < 0x10000) {
		count = 3;
	} else {
		count = 4;
	}

	// Emit continuation bytes from last to first, six payload bits each;
	// whatever is left over belongs in the lead byte
	size_t i;
	for (i = count - 1; i > 0; --i) {
		buffer[i] = (bits & 0x3F) | 0x80;
		bits >>= 6;
	}
	buffer[0] = bits | leadPrefix[count];
	return count;
}
143
/* Compares a UTF-16 string with a UTF-8 string code point by code point.
 *
 * utf16Length and utf8Length are the sizes of the two buffers in bytes.
 * Code points are decoded with utf16Char and utf8Char.
 *
 * Returns -1 if the UTF-16 string orders first, 1 if the UTF-8 string orders
 * first, and 0 if they are equal. When one input runs out while all decoded
 * code points so far matched, the shorter input orders first. */
int utfcmp(const uint16_t* utf16, const char* utf8, size_t utf16Length, size_t utf8Length) {
	while (utf16Length > 0 && utf8Length > 0) {
		uint32_t char1 = utf16Char(&utf16, &utf16Length);
		uint32_t char2 = utf8Char(&utf8, &utf8Length);
		// Compare right after decoding. Deferring the comparison to the next
		// iteration would skip the final pair whenever both inputs end on the
		// same step, making strings that differ only in their last code
		// point compare equal.
		if (char1 < char2) {
			return -1;
		}
		if (char1 > char2) {
			return 1;
		}
	}
	if (utf16Length == 0 && utf8Length > 0) {
		return -1;
	}
	if (utf16Length > 0 && utf8Length == 0) {
		return 1;
	}
	return 0;
}
164
/* Converts a UTF-16 buffer to a newly allocated, NUL-terminated UTF-8 string.
 *
 * length is the size of the input in bytes. Each code point is decoded with
 * utf16Char and re-encoded with _toUtf8; input that utf16Char rejects comes
 * back as 0 and is therefore encoded as a single zero byte in the output.
 *
 * Returns the new string, which the caller must free, or NULL if memory
 * could not be allocated. An empty input yields an empty string. */
char* utf16to8(const uint16_t* utf16, size_t length) {
	// Worst-case output size: every utf16Char call consumes at least one
	// 16-bit unit (or a trailing odd byte) and yields at most 3 UTF-8 bytes,
	// except surrogate pairs, which consume two units and yield 4. So
	// 3 bytes per unit, rounded up, plus the terminator is always enough.
	size_t maxUnits = length / 2 + (length & 1);
	if (maxUnits > (SIZE_MAX - 1) / 3) {
		return NULL;
	}
	char* utf8 = malloc(maxUnits * 3 + 1);
	if (!utf8) {
		return NULL;
	}

	size_t utf8Length = 0;
	while (length > 0) {
		char buffer[4];
		uint32_t unichar = utf16Char(&utf16, &length);
		size_t bytes = _toUtf8(unichar, buffer);
		memcpy(utf8 + utf8Length, buffer, bytes);
		utf8Length += bytes;
	}
	utf8[utf8Length] = '\0';

	// Give back the unused tail. If shrinking fails the original buffer is
	// still valid and correctly terminated, so return that instead.
	char* trimmed = realloc(utf8, utf8Length + 1);
	return trimmed ? trimmed : utf8;
}
212
/* Converts one hexadecimal digit character to its numeric value.
 *
 * Accepts '0'-'9', 'a'-'f' and 'A'-'F'.
 *
 * Returns the value 0-15, or -1 if digit is not a hexadecimal digit. */
int hexDigit(char digit) {
	if (digit >= '0' && digit <= '9') {
		return digit - '0';
	}
	if (digit >= 'a' && digit <= 'f') {
		return digit - 'a' + 10;
	}
	if (digit >= 'A' && digit <= 'F') {
		return digit - 'A' + 10;
	}
	return -1;
}
247
/* Parses exactly eight hexadecimal digits from line into a 32-bit value.
 *
 * On success the value is stored in *out and a pointer to the character
 * after the last digit is returned. If any of the eight characters is not a
 * hexadecimal digit, NULL is returned and *out is left untouched. */
const char* hex32(const char* line, uint32_t* out) {
	uint32_t accum = 0;
	int remaining;
	for (remaining = 8; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | (uint32_t) nybble;
		++line;
	}
	*out = accum;
	return line;
}
263
/* Parses exactly six hexadecimal digits from line into a 24-bit value.
 *
 * On success the value is stored in *out and a pointer to the character
 * after the last digit is returned. If any of the six characters is not a
 * hexadecimal digit, NULL is returned and *out is left untouched. */
const char* hex24(const char* line, uint32_t* out) {
	uint32_t accum = 0;
	int remaining;
	for (remaining = 6; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | (uint32_t) nybble;
		++line;
	}
	*out = accum;
	return line;
}
279
/* Parses exactly four hexadecimal digits from line into a 16-bit value.
 *
 * *out is cleared before parsing starts. On success the value is stored in
 * *out and a pointer to the character after the last digit is returned. If
 * any of the four characters is not a hexadecimal digit, NULL is returned
 * and *out stays 0. */
const char* hex16(const char* line, uint16_t* out) {
	*out = 0;
	uint16_t accum = 0;
	int remaining;
	for (remaining = 4; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (uint16_t) ((accum << 4) | nybble);
		++line;
	}
	*out = accum;
	return line;
}
296
/* Parses exactly three hexadecimal digits from line into a 12-bit value.
 *
 * *out is cleared before parsing starts. On success the value is stored in
 * *out and a pointer to the character after the last digit is returned. If
 * any of the three characters is not a hexadecimal digit, NULL is returned
 * and *out stays 0. */
const char* hex12(const char* line, uint16_t* out) {
	*out = 0;
	uint16_t accum = 0;
	int remaining;
	for (remaining = 3; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (uint16_t) ((accum << 4) | nybble);
		++line;
	}
	*out = accum;
	return line;
}
313
/* Parses exactly two hexadecimal digits from line into an 8-bit value.
 *
 * *out is cleared before parsing starts. On success the value is stored in
 * *out and a pointer to the character after the last digit is returned. If
 * either character is not a hexadecimal digit, NULL is returned and *out
 * stays 0. */
const char* hex8(const char* line, uint8_t* out) {
	*out = 0;
	uint8_t accum = 0;
	int remaining;
	for (remaining = 2; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (uint8_t) ((accum << 4) | nybble);
		++line;
	}
	*out = accum;
	return line;
}
330
/* Parses a single hexadecimal digit from line into a 4-bit value.
 *
 * *out is cleared before parsing starts. On success the value is stored in
 * *out and line is returned; if the character is not a hexadecimal digit,
 * NULL is returned and *out stays 0.
 *
 * NOTE(review): unlike hex8/hex12/hex16/hex24/hex32, the returned pointer is
 * NOT advanced past the digit that was read. This may be an oversight, but
 * callers are not visible here, so the behavior is kept as-is. Confirm
 * against the callers before changing it. */
const char* hex4(const char* line, uint8_t* out) {
	*out = 0;
	int nybble = hexDigit(*line);
	if (nybble < 0) {
		return NULL;
	}
	*out = (uint8_t) nybble;
	return line;
}
343}