src/util/string.c (view raw)
1/* Copyright (c) 2013-2019 Jeffrey Pfau
2 *
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6#include <mgba-util/string.h>
7
8#include <mgba-util/vector.h>
9
10#include <string.h>
11
12DEFINE_VECTOR(StringList, char*);
13
14#ifndef HAVE_STRNDUP
/*
 * Fallback strndup for platforms that do not provide one.
 *
 * Copies at most len characters from start, stopping early at a NUL
 * terminator, into a newly allocated NUL-terminated buffer.
 *
 * Returns the new string, which the caller must free, or NULL if the
 * allocation fails.
 */
char* strndup(const char* start, size_t len) {
	// Measure the real copy length first so that an oversized len neither
	// over-allocates nor wraps the "+ 1" in the size computation.
	size_t copyLen = 0;
	while (copyLen < len && start[copyLen]) {
		++copyLen;
	}
	char* out = malloc(copyLen + 1);
	if (!out) {
		return NULL;
	}
	memcpy(out, start, copyLen);
	out[copyLen] = '\0';
	return out;
}
22#endif
23
24#ifndef HAVE_STRDUP
/*
 * Fallback strdup for platforms that do not provide one.
 *
 * Returns a newly allocated copy of str, which the caller must free, or
 * NULL if the allocation fails.
 */
char* strdup(const char* str) {
	// Include the terminator in the copy so no separate store is needed
	size_t size = strlen(str) + 1;
	char* out = malloc(size);
	if (!out) {
		return NULL;
	}
	memcpy(out, str, size);
	return out;
}
32#endif
33
34#ifndef HAVE_STRLCPY
/*
 * Fallback strlcpy for platforms that do not provide one.
 *
 * Copies src into dst, writing at most dstsize bytes including the NUL
 * terminator. When dstsize is 0 nothing is written.
 *
 * Returns the length of src, so a result >= dstsize means the copy was
 * truncated.
 */
size_t strlcpy(char* restrict dst, const char* restrict src, size_t dstsize) {
	size_t copied = 0;
	if (dstsize) {
		// Reserve one byte of dst for the terminator
		size_t room = dstsize - 1;
		while (copied < room && src[copied]) {
			dst[copied] = src[copied];
			++copied;
		}
		dst[copied] = '\0';
	}
	// Account for whatever part of src did not fit
	return copied + strlen(src + copied);
}
49#endif
50
/*
 * Finds the last occurrence of needle that lies entirely within the first
 * len bytes of haystack.
 *
 * Returns a pointer to the start of that occurrence, or NULL if there is
 * none. An empty needle matches at the end of the searched region, so
 * haystack + len is returned in that case.
 */
char* strnrstr(const char* restrict haystack, const char* restrict needle, size_t len) {
	size_t needleLen = strlen(needle);
	if (needleLen == 0) {
		// Handled up front: a zero-length needle would otherwise never let
		// an unsigned length comparison terminate the scan.
		return (char*) haystack + len;
	}
	if (len < needleLen) {
		return NULL;
	}
	// Walk backwards from the last position where needle can still fit; the
	// first hit found this way is the last occurrence.
	const char* candidate = haystack + (len - needleLen);
	for (;;) {
		if (strncmp(needle, candidate, needleLen) == 0) {
			return (char*) candidate;
		}
		if (candidate == haystack) {
			return NULL;
		}
		--candidate;
	}
}
62
/*
 * Returns true when the NUL-terminated string s1 finishes with the
 * NUL-terminated string end. An empty end always matches.
 */
bool endswith(const char* restrict s1, const char* restrict end) {
	size_t fullLen = strlen(s1);
	size_t suffixLen = strlen(end);
	// The tail of s1 and end are both exactly suffixLen bytes long, so a
	// fixed-length comparison is sufficient.
	return fullLen >= suffixLen && memcmp(s1 + (fullLen - suffixLen), end, suffixLen) == 0;
}
71
/*
 * Returns true when the NUL-terminated string s1 begins with the
 * NUL-terminated string start. An empty start always matches.
 */
bool startswith(const char* restrict s1, const char* restrict start) {
	size_t prefixLen = strlen(start);
	// s1 is known to hold at least prefixLen bytes before they are compared
	return strlen(s1) >= prefixLen && memcmp(s1, start, prefixLen) == 0;
}
80
/*
 * Decodes one code point from a UTF-16 stream in native byte order.
 *
 * unicode: in/out cursor, advanced past every code unit that is consumed
 * length:  in/out count of remaining BYTES (two per code unit)
 *
 * Returns the decoded code point. Returns 0 when fewer than two bytes
 * remain (length is then forced to 0 and the cursor is left alone), or when
 * a surrogate pair is malformed (both code units are still consumed).
 */
uint32_t utf16Char(const uint16_t** unicode, size_t* length) {
	if (*length < 2) {
		*length = 0;
		return 0;
	}
	uint32_t first = *(*unicode)++;
	*length -= 2;
	if (first < 0xD800 || first >= 0xE000) {
		// Not a surrogate: the code unit is the code point
		return first;
	}

	// Anything in the surrogate range needs a second code unit
	if (*length < 2) {
		*length = 0;
		return 0;
	}
	uint32_t second = *(*unicode)++;
	*length -= 2;

	// Valid pairs are a high surrogate (D800-DBFF) followed by a low
	// surrogate (DC00-DFFF)
	if (first >= 0xDC00 || second < 0xDC00 || second >= 0xE000) {
		return 0;
	}
	return ((first - 0xD800) << 10) + (second - 0xDC00) + 0x10000;
}
110
/*
 * Decodes one code point from a UTF-8 byte stream.
 *
 * unicode: in/out cursor, advanced past every byte that is consumed
 * length:  in/out count of remaining bytes
 *
 * Returns the decoded code point, or 0 when the input is empty, the lead
 * byte is not a valid 1- to 4-byte lead, a continuation byte is malformed,
 * or the sequence is cut short (length is then forced to 0).
 */
uint32_t utf8Char(const char** unicode, size_t* length) {
	if (*length == 0) {
		return 0;
	}
	// Bytes are handled as unsigned so that sign extension of a plain char
	// can never leak set high bits into the decoded value.
	uint8_t byte = (uint8_t) **unicode;
	--*length;
	++*unicode;
	if (!(byte & 0x80)) {
		return byte;
	}

	// tops[n] is the lead-byte tag for a sequence with n + 1 continuation
	// bytes; tops[n + 1] is the mask that isolates that tag.
	static const int tops[4] = { 0xC0, 0xE0, 0xF0, 0xF8 };
	size_t numBytes;
	for (numBytes = 0; numBytes < 3; ++numBytes) {
		if ((byte & tops[numBytes + 1]) == tops[numBytes]) {
			break;
		}
	}
	if (numBytes == 3) {
		// Stray continuation byte or an unsupported lead byte
		return 0;
	}
	uint32_t unichar = byte & ~tops[numBytes];
	++numBytes;
	if (*length < numBytes) {
		*length = 0;
		return 0;
	}
	size_t i;
	for (i = 0; i < numBytes; ++i) {
		unichar <<= 6;
		byte = (uint8_t) **unicode;
		--*length;
		++*unicode;
		if ((byte & 0xC0) != 0x80) {
			return 0;
		}
		unichar |= byte & 0x3F;
	}
	return unichar;
}
151
/*
 * Encodes one code point as UTF-8.
 *
 * unichar: code point to encode; values above 0x10FFFF are replaced with
 *          U+FFFD
 * buffer:  destination with room for at least 4 bytes; no terminator is
 *          written
 *
 * Returns the number of bytes stored in buffer (1 to 4).
 */
size_t toUtf8(uint32_t unichar, char* buffer) {
	uint32_t codepoint = unichar > 0x10FFFF ? 0xFFFD : unichar;
	if (codepoint < 0x80) {
		buffer[0] = codepoint;
		return 1;
	}

	// Choose the sequence length and the matching lead-byte tag
	size_t total;
	uint32_t lead;
	if (codepoint < 0x800) {
		total = 2;
		lead = 0xC0;
	} else if (codepoint < 0x10000) {
		total = 3;
		lead = 0xE0;
	} else {
		total = 4;
		lead = 0xF0;
	}

	// Fill continuation bytes from the end, six payload bits at a time
	size_t i;
	for (i = total - 1; i > 0; --i) {
		buffer[i] = (codepoint & 0x3F) | 0x80;
		codepoint >>= 6;
	}
	// What is left of the code point is the lead byte payload
	buffer[0] = codepoint | lead;
	return total;
}
182
/*
 * Compares a UTF-16 string with a UTF-8 string, code point by code point.
 *
 * utf16Length and utf8Length are both byte counts.
 *
 * Returns -1 if the UTF-16 string orders first, 1 if the UTF-8 string
 * orders first, and 0 if they are equal. When one string is a prefix of the
 * other, the shorter one orders first.
 */
int utfcmp(const uint16_t* utf16, const char* utf8, size_t utf16Length, size_t utf8Length) {
	while (utf16Length > 0 && utf8Length > 0) {
		uint32_t char1 = utf16Char(&utf16, &utf16Length);
		uint32_t char2 = utf8Char(&utf8, &utf8Length);
		// Compare immediately after decoding so that the final pair is
		// checked even when this iteration exhausts one of the inputs.
		if (char1 < char2) {
			return -1;
		}
		if (char1 > char2) {
			return 1;
		}
	}
	if (utf16Length == 0 && utf8Length > 0) {
		return -1;
	}
	if (utf16Length > 0 && utf8Length == 0) {
		return 1;
	}
	return 0;
}
203
/*
 * Converts a UTF-16 buffer to a newly allocated, NUL-terminated UTF-8
 * string.
 *
 * utf16:  input code units
 * length: size of the input in BYTES
 *
 * Each value produced by utf16Char is re-encoded with toUtf8 and appended.
 * Returns the new string, which the caller must free, or NULL if memory
 * cannot be allocated.
 */
char* utf16to8(const uint16_t* utf16, size_t length) {
	char buffer[4];
	// Starting guess of one output byte per input code unit. The extra
	// sizeof(buffer) + 1 keeps capacity above the largest single append, so
	// doubling is always enough when the buffer has to grow.
	size_t capacity = length / 2 + sizeof(buffer) + 1;
	char* utf8 = malloc(capacity);
	if (!utf8) {
		return NULL;
	}
	size_t utf8Length = 0;
	while (length > 0) {
		uint32_t unichar = utf16Char(&utf16, &length);
		size_t bytes = toUtf8(unichar, buffer);
		// Invariant: utf8Length < capacity, leaving room for the terminator
		if (bytes >= capacity - utf8Length) {
			size_t newCapacity = capacity * 2;
			char* grown = realloc(utf8, newCapacity);
			if (!grown) {
				free(utf8);
				return NULL;
			}
			utf8 = grown;
			capacity = newCapacity;
		}
		memcpy(utf8 + utf8Length, buffer, bytes);
		utf8Length += bytes;
	}
	utf8[utf8Length] = '\0';

	// Give back the unused tail; if shrinking fails, the larger buffer is
	// still valid and can be returned as it is.
	char* trimmed = realloc(utf8, utf8Length + 1);
	return trimmed ? trimmed : utf8;
}
250
extern const uint16_t gbkUnicodeTable[];

/*
 * Converts a GBK-encoded buffer to a newly allocated, NUL-terminated UTF-8
 * string.
 *
 * gbk:    input bytes
 * length: number of input bytes
 *
 * Bytes below 0x80 pass through unchanged, 0x80 becomes U+20AC and 0xFF
 * becomes U+FFFD. Any other byte is taken as the lead of a two-byte
 * sequence; the pair is looked up in gbkUnicodeTable, and a trail byte
 * outside 0x40-0xFE yields U+FFFD. A lead byte at the very end of the input
 * produces no output.
 *
 * Returns the new string, which the caller must free, or NULL if memory
 * cannot be allocated.
 */
char* gbkToUtf8(const char* gbk, size_t length) {
	const uint8_t* input = (const uint8_t*) gbk;
	char buffer[4];
	// Starting guess of one output byte per input byte. The extra
	// sizeof(buffer) + 1 keeps capacity above the largest single append, so
	// doubling is always enough when the buffer has to grow.
	size_t capacity = length + sizeof(buffer) + 1;
	char* utf8 = malloc(capacity);
	if (!utf8) {
		return NULL;
	}
	size_t utf8Length = 0;
	// The pending-lead state is tracked separately from the row index,
	// because lead byte 0x81 maps to row 0 and must not read as "no lead".
	bool haveLead = false;
	uint8_t gbk1 = 0;
	size_t offset;
	for (offset = 0; offset < length; ++offset) {
		uint8_t byte = input[offset];
		unsigned unichar = 0xFFFD;
		if (haveLead) {
			if (byte >= 0x40 && byte != 0xFF) {
				// TODO: GB-18030 support?
				unichar = gbkUnicodeTable[gbk1 * 0xBF + byte - 0x40];
			}
			haveLead = false;
		} else if (!(byte & 0x80)) {
			unichar = byte;
		} else if (byte == 0xFF) {
			unichar = 0xFFFD;
		} else if (byte == 0x80) {
			unichar = 0x20AC; // Euro
		} else {
			gbk1 = byte - 0x81;
			haveLead = true;
			continue;
		}

		size_t bytes = toUtf8(unichar, buffer);
		// Invariant: utf8Length < capacity, leaving room for the terminator
		if (bytes >= capacity - utf8Length) {
			size_t newCapacity = capacity * 2;
			char* grown = realloc(utf8, newCapacity);
			if (!grown) {
				free(utf8);
				return NULL;
			}
			utf8 = grown;
			capacity = newCapacity;
		}
		memcpy(utf8 + utf8Length, buffer, bytes);
		utf8Length += bytes;
	}
	utf8[utf8Length] = '\0';

	// Give back the unused tail; if shrinking fails, the larger buffer is
	// still valid and can be returned as it is.
	char* trimmed = realloc(utf8, utf8Length + 1);
	return trimmed ? trimmed : utf8;
}
319
/*
 * Converts one hexadecimal digit character to its numeric value.
 *
 * Returns 0-15 for '0'-'9', 'a'-'f' and 'A'-'F', or -1 for any other
 * character.
 */
int hexDigit(char digit) {
	if (digit >= '0' && digit <= '9') {
		return digit - '0';
	}
	if (digit >= 'a' && digit <= 'f') {
		return digit - 'a' + 10;
	}
	if (digit >= 'A' && digit <= 'F') {
		return digit - 'A' + 10;
	}
	return -1;
}
354
/*
 * Parses exactly 8 hexadecimal digits from line into a 32-bit value.
 *
 * On success stores the value in *out and returns a pointer to the
 * character following the digits. On the first non-hex character returns
 * NULL and leaves *out unmodified.
 */
const char* hex32(const char* line, uint32_t* out) {
	uint32_t accum = 0;
	int remaining;
	for (remaining = 8; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
370
/*
 * Parses exactly 6 hexadecimal digits from line into the low 24 bits of a
 * 32-bit value.
 *
 * On success stores the value in *out and returns a pointer to the
 * character following the digits. On the first non-hex character returns
 * NULL and leaves *out unmodified.
 */
const char* hex24(const char* line, uint32_t* out) {
	uint32_t accum = 0;
	int remaining;
	for (remaining = 6; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
386
/*
 * Parses exactly 4 hexadecimal digits from line into a 16-bit value.
 *
 * On success stores the value in *out and returns a pointer to the
 * character following the digits. On the first non-hex character returns
 * NULL with *out set to 0.
 */
const char* hex16(const char* line, uint16_t* out) {
	// Cleared up front so a failed parse never leaves a stale value behind
	*out = 0;
	uint16_t accum = 0;
	int remaining;
	for (remaining = 4; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
403
/*
 * Parses exactly 3 hexadecimal digits from line into the low 12 bits of a
 * 16-bit value.
 *
 * On success stores the value in *out and returns a pointer to the
 * character following the digits. On the first non-hex character returns
 * NULL with *out set to 0.
 */
const char* hex12(const char* line, uint16_t* out) {
	// Cleared up front so a failed parse never leaves a stale value behind
	*out = 0;
	uint16_t accum = 0;
	int remaining;
	for (remaining = 3; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
420
/*
 * Parses exactly 2 hexadecimal digits from line into an 8-bit value.
 *
 * On success stores the value in *out and returns a pointer to the
 * character following the digits. On the first non-hex character returns
 * NULL with *out set to 0.
 */
const char* hex8(const char* line, uint8_t* out) {
	// Cleared up front so a failed parse never leaves a stale value behind
	*out = 0;
	uint8_t accum = 0;
	int remaining;
	for (remaining = 2; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
437
/*
 * Parses a single hexadecimal digit from line into the low 4 bits of an
 * 8-bit value.
 *
 * On success stores the value in *out and returns line itself. On a non-hex
 * character returns NULL with *out set to 0.
 *
 * NOTE(review): the returned pointer is NOT advanced past the digit, unlike
 * hex8/hex12/hex16/hex24/hex32 in this file. This looks unintentional, but
 * callers are not visible here; confirm before changing it.
 */
const char* hex4(const char* line, uint8_t* out) {
	// Cleared up front so a failed parse never leaves a stale value behind
	*out = 0;
	int nybble = hexDigit(*line);
	if (nybble < 0) {
		return NULL;
	}
	*out = (uint8_t) nybble;
	return line;
}
450
/*
 * Strips trailing whitespace, as classified by isspace, from string in
 * place by overwriting it with NUL bytes. A string made up entirely of
 * whitespace becomes empty.
 */
void rtrim(char* string) {
	size_t len = strlen(string);
	// Checking len before touching string[len - 1] keeps every access inside
	// the string. The unsigned char cast is required because isspace is
	// undefined for negative values other than EOF.
	while (len > 0 && isspace((unsigned char) string[len - 1])) {
		--len;
		string[len] = '\0';
	}
}
461
/*
 * Parses a quoted string literal from the start of unparsed.
 *
 * unparsed:    input text; its first character must be ' or "
 * unparsedLen: number of input characters available
 * parsed:      output buffer, zero-filled before parsing starts
 * parsedLen:   size of the output buffer
 *
 * The escapes \n, \r, \\, \' and \" are decoded. Parsing stops at the
 * matching closing quote, or at a raw newline or carriage return.
 *
 * Returns the number of characters written to parsed, or -1 if the input
 * does not start with a quote, contains an unknown escape, or runs out (or
 * the output fills up) before a terminating character is found.
 */
ssize_t parseQuotedString(const char* unparsed, ssize_t unparsedLen, char* parsed, ssize_t parsedLen) {
	memset(parsed, 0, parsedLen);
	if (unparsedLen <= 0 || parsedLen <= 0) {
		return -1;
	}
	const char quote = unparsed[0];
	if (quote != '"' && quote != '\'') {
		return -1;
	}

	ssize_t written = 0;
	bool inEscape = false;
	ssize_t pos;
	for (pos = 1; pos < unparsedLen && written < parsedLen; ++pos) {
		const char c = unparsed[pos];
		if (inEscape) {
			char decoded;
			if (c == 'n') {
				decoded = '\n';
			} else if (c == 'r') {
				decoded = '\r';
			} else if (c == '\\' || c == '\'' || c == '"') {
				decoded = c;
			} else {
				return -1;
			}
			parsed[written] = decoded;
			++written;
			inEscape = false;
		} else if (c == quote) {
			// Closing quote: done
			return written;
		} else if (c == '\\') {
			inEscape = true;
		} else if (c == '\n' || c == '\r') {
			// A raw line break also ends the string
			return written;
		} else {
			parsed[written] = c;
			++written;
		}
	}
	return -1;
}