src/util/string.c (view raw)
1/* Copyright (c) 2013-2019 Jeffrey Pfau
2 *
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6#include <mgba-util/string.h>
7
8#include <mgba-util/vector.h>
9
10#include <string.h>
11
// Instantiates the StringList vector for char* elements.
// NOTE(review): DEFINE_VECTOR is presumably provided by <mgba-util/vector.h>; confirm what it expands to there.
12DEFINE_VECTOR(StringList, char*);
13
#ifndef HAVE_STRNDUP
// Fallback strndup for platforms that lack one.
//
// Duplicates at most `len` bytes of `start` into a freshly allocated,
// NUL-terminated buffer. Copying stops early at the first NUL found within the
// first `len` bytes, so the allocation is sized to the actual string.
//
// Returns the new string, which the caller must free(), or NULL if the
// allocation fails.
char* strndup(const char* start, size_t len) {
	// memchr stops at the first match, so it never reads past the terminator
	const char* terminator = memchr(start, '\0', len);
	if (terminator) {
		len = (size_t) (terminator - start);
	}
	char* out = malloc(len + 1);
	if (!out) {
		return NULL;
	}
	memcpy(out, start, len);
	out[len] = '\0';
	return out;
}
#endif
23
#ifndef HAVE_STRDUP
// Fallback strdup for platforms that lack one.
//
// Returns a freshly allocated copy of the NUL-terminated string `str`, which
// the caller must free(), or NULL if the allocation fails.
char* strdup(const char* str) {
	// Include the terminator in the copy so no separate store is needed
	size_t size = strlen(str) + 1;
	char* out = malloc(size);
	if (!out) {
		return NULL;
	}
	memcpy(out, str, size);
	return out;
}
#endif
33
// Finds the last occurrence of `needle` that lies entirely within the first
// `len` bytes of `haystack`.
//
// Returns a pointer to the start of that occurrence, or NULL if there is none.
// An empty needle matches at every offset, so the last match is the end of
// the window and `haystack + len` is returned.
char* strnrstr(const char* restrict haystack, const char* restrict needle, size_t len) {
	size_t needleLen = strlen(needle);
	if (needleLen == 0) {
		// Without this guard `len >= needleLen` could never become false and
		// the scan below would wrap `len` and run off the end of the buffer
		return (char*) haystack + len;
	}
	char* last = NULL;
	const char* next = haystack;
	// `len` tracks how many bytes remain from `next` to the end of the window
	for (; len >= needleLen; --len, ++next) {
		if (strncmp(needle, next, needleLen) == 0) {
			last = (char*) next;
		}
	}
	return last;
}
45
// Reports whether the string `s1` ends with the string `end`.
// An empty `end` matches every string.
bool endswith(const char* restrict s1, const char* restrict end) {
	const size_t fullLen = strlen(s1);
	const size_t suffixLen = strlen(end);
	// A suffix longer than the string itself can never match
	return suffixLen <= fullLen && memcmp(s1 + (fullLen - suffixLen), end, suffixLen) == 0;
}
54
// Reports whether the string `s1` begins with the string `start`.
// An empty `start` matches every string.
bool startswith(const char* restrict s1, const char* restrict start) {
	// Walk the prefix; if s1 ends first, its terminator mismatches the
	// (non-NUL) prefix character and the comparison fails
	while (*start != '\0') {
		if (*s1 != *start) {
			return false;
		}
		++s1;
		++start;
	}
	return true;
}
63
// Decodes one code point from a UTF-16 stream.
//
// `*unicode` is the read cursor and `*length` the number of BYTES remaining;
// both are updated for every 16-bit unit consumed.
//
// Returns the decoded code point, or 0 when:
//  - fewer than 2 bytes remain (`*length` is set to 0),
//  - a surrogate is read but fewer than 2 bytes follow (`*length` is set to 0),
//  - the two units read do not form a valid high/low surrogate pair (both
//    units are consumed).
uint32_t utf16Char(const uint16_t** unicode, size_t* length) {
	if (*length < 2) {
		*length = 0;
		return 0;
	}
	const uint16_t first = *(*unicode)++;
	*length -= 2;

	// Anything outside the surrogate range is a complete code point
	if (first < 0xD800 || first >= 0xE000) {
		return first;
	}

	if (*length < 2) {
		*length = 0;
		return 0;
	}
	// The second unit is consumed before the pair is validated
	const uint16_t second = *(*unicode)++;
	*length -= 2;

	if (first >= 0xDC00) {
		// Stream started with a low surrogate
		return 0;
	}
	if (second < 0xDC00 || second >= 0xE000) {
		// High surrogate not followed by a low surrogate
		return 0;
	}

	const uint32_t highBits = (uint32_t) (first - 0xD800) << 10;
	const uint32_t lowBits = (uint32_t) (second - 0xDC00);
	return highBits + lowBits + 0x10000;
}
93
// Decodes one code point from a UTF-8 stream.
//
// `*unicode` is the read cursor and `*length` the number of bytes remaining;
// both are updated for every byte consumed.
//
// Returns the decoded code point, or 0 when:
//  - no bytes remain (nothing is consumed),
//  - the first byte is not a valid lead byte, i.e. it is a continuation byte
//    or 0xF8 and above (only that byte is consumed),
//  - fewer bytes remain than the lead byte calls for (`*length` is set to 0),
//  - a continuation byte is malformed (bytes up to and including it are
//    consumed).
//
// Overlong encodings and encoded surrogates are not rejected.
uint32_t utf8Char(const char** unicode, size_t* length) {
	if (*length == 0) {
		return 0;
	}
	// Read bytes as unsigned: with a signed plain char the masking below would
	// operate on a sign-extended value and leak set bits into the result
	uint8_t byte = (uint8_t) **unicode;
	--*length;
	++*unicode;
	if (!(byte & 0x80)) {
		return byte;
	}

	// tops[n] is the marker of a lead byte that has n + 1 continuation bytes;
	// tops[n + 1] is the mask that isolates that marker
	static const int tops[4] = { 0xC0, 0xE0, 0xF0, 0xF8 };
	size_t numBytes;
	for (numBytes = 0; numBytes < 3; ++numBytes) {
		if ((byte & tops[numBytes + 1]) == tops[numBytes]) {
			break;
		}
	}
	if (numBytes == 3) {
		return 0;
	}
	uint32_t unichar = (uint32_t) (byte & ~tops[numBytes]);
	++numBytes;
	if (*length < numBytes) {
		*length = 0;
		return 0;
	}
	size_t i;
	for (i = 0; i < numBytes; ++i) {
		unichar <<= 6;
		byte = (uint8_t) **unicode;
		--*length;
		++*unicode;
		if ((byte & 0xC0) != 0x80) {
			return 0;
		}
		unichar |= byte & 0x3F;
	}
	return unichar;
}
134
// Encodes the code point `unichar` as UTF-8 into `buffer`, which must have
// room for up to 4 bytes. No terminator is written.
//
// Values above 0x10FFFF are encoded as U+FFFD instead.
// Returns the number of bytes written (1 to 4).
size_t toUtf8(uint32_t unichar, char* buffer) {
	const uint32_t codepoint = unichar > 0x10FFFF ? 0xFFFD : unichar;

	if (codepoint < 0x80) {
		buffer[0] = codepoint;
		return 1;
	}

	// Choose the sequence length and the marker bits of its lead byte
	size_t count;
	uint32_t leadMarker;
	if (codepoint < 0x800) {
		count = 2;
		leadMarker = 0xC0;
	} else if (codepoint < 0x10000) {
		count = 3;
		leadMarker = 0xE0;
	} else {
		count = 4;
		leadMarker = 0xF0;
	}

	// Fill continuation bytes from last to first, 6 payload bits at a time;
	// whatever is left over belongs in the lead byte
	uint32_t remaining = codepoint;
	size_t index;
	for (index = count - 1; index > 0; --index) {
		buffer[index] = (remaining & 0x3F) | 0x80;
		remaining >>= 6;
	}
	buffer[0] = remaining | leadMarker;
	return count;
}
165
// Compares a UTF-16 string against a UTF-8 string code point by code point.
//
// Both lengths are byte counts. Returns -1 if the UTF-16 string orders first,
// 1 if the UTF-8 string orders first, and 0 if they are equal. When one string
// is a prefix of the other, the shorter one orders first.
int utfcmp(const uint16_t* utf16, const char* utf8, size_t utf16Length, size_t utf8Length) {
	while (utf16Length > 0 && utf8Length > 0) {
		// Decode first, then compare, so the final pair is checked before
		// the loop condition is evaluated again
		uint32_t char1 = utf16Char(&utf16, &utf16Length);
		uint32_t char2 = utf8Char(&utf8, &utf8Length);
		if (char1 < char2) {
			return -1;
		}
		if (char1 > char2) {
			return 1;
		}
	}
	if (utf16Length == 0 && utf8Length > 0) {
		return -1;
	}
	if (utf16Length > 0 && utf8Length == 0) {
		return 1;
	}
	return 0;
}
186
// Converts `length` BYTES of UTF-16 data into a newly allocated,
// NUL-terminated UTF-8 string.
//
// Each unit sequence is decoded with utf16Char and re-encoded with toUtf8, so
// anything utf16Char reports as 0 is emitted as a single zero byte.
//
// Returns the string, which the caller must free(), or NULL if an allocation
// fails. Empty input yields an empty string.
char* utf16to8(const uint16_t* utf16, size_t length) {
	char* utf8 = NULL;
	size_t capacity = 0;
	size_t used = 0;
	char buffer[4];
	while (length > 0) {
		uint32_t unichar = utf16Char(&utf16, &length);
		size_t bytes = toUtf8(unichar, buffer);

		// Always keep one spare byte for the terminator
		size_t needed = used + bytes + 1;
		if (needed > capacity) {
			// First guess: one output byte per remaining input byte; after
			// that, double. Either way never less than what is needed now.
			size_t newCapacity = capacity ? capacity * 2 : length + bytes + 1;
			if (newCapacity < needed) {
				newCapacity = needed;
			}
			char* grown = realloc(utf8, newCapacity);
			if (!grown) {
				free(utf8);
				return NULL;
			}
			utf8 = grown;
			capacity = newCapacity;
		}
		memcpy(utf8 + used, buffer, bytes);
		used += bytes;
	}

	// Trim to the exact size (or allocate the lone terminator for empty input)
	char* result = realloc(utf8, used + 1);
	if (!result) {
		free(utf8);
		return NULL;
	}
	result[used] = '\0';
	return result;
}
233
extern const uint16_t gbkUnicodeTable[];

// Converts `length` bytes of GBK-encoded text into a newly allocated,
// NUL-terminated UTF-8 string.
//
// Bytes below 0x80 pass through unchanged, 0x80 maps to the euro sign, and
// 0xFF maps to U+FFFD. Any other byte is a lead byte whose following byte
// selects an entry in gbkUnicodeTable; a following byte below 0x40 or equal
// to 0xFF yields U+FFFD. A lead byte at the very end of the input is dropped.
//
// Returns the string, which the caller must free(), or NULL if an allocation
// fails. Empty input yields an empty string.
char* gbkToUtf8(const char* gbk, size_t length) {
	const uint8_t* input = (const uint8_t*) gbk;
	char* utf8 = NULL;
	size_t capacity = 0;
	size_t used = 0;
	// Table row of the pending lead byte. Row 0 (lead byte 0x81) is valid, so
	// "a lead byte is pending" needs its own flag rather than a zero check.
	uint8_t row = 0;
	bool havePendingLead = false;
	char buffer[4];
	size_t offset;
	for (offset = 0; offset < length; ++offset) {
		uint8_t byte = input[offset];
		unsigned unichar = 0xFFFD;
		if (havePendingLead) {
			if (byte >= 0x40 && byte != 0xFF) {
				// TODO: GB-18030 support?
				unichar = gbkUnicodeTable[row * 0xBF + byte - 0x40];
			}
			havePendingLead = false;
		} else if (!(byte & 0x80)) {
			unichar = byte;
		} else if (byte == 0xFF) {
			unichar = 0xFFFD;
		} else if (byte == 0x80) {
			unichar = 0x20AC; // Euro
		} else {
			row = byte - 0x81;
			havePendingLead = true;
			continue;
		}

		size_t bytes = toUtf8(unichar, buffer);

		// Always keep one spare byte for the terminator
		size_t needed = used + bytes + 1;
		if (needed > capacity) {
			// Start at the input size, then double; never less than needed
			size_t newCapacity = capacity ? capacity * 2 : length + 1;
			if (newCapacity < needed) {
				newCapacity = needed;
			}
			char* grown = realloc(utf8, newCapacity);
			if (!grown) {
				free(utf8);
				return NULL;
			}
			utf8 = grown;
			capacity = newCapacity;
		}
		memcpy(utf8 + used, buffer, bytes);
		used += bytes;
	}

	// Trim to the exact size (or allocate the lone terminator for empty input)
	char* result = realloc(utf8, used + 1);
	if (!result) {
		free(utf8);
		return NULL;
	}
	result[used] = '\0';
	return result;
}
302
// Returns the value (0-15) of the hexadecimal digit `digit`, accepting both
// upper- and lower-case letters, or -1 if it is not a hexadecimal digit.
int hexDigit(char digit) {
	if (digit >= '0' && digit <= '9') {
		return digit - '0';
	}
	if (digit >= 'a' && digit <= 'f') {
		return digit - 'a' + 10;
	}
	if (digit >= 'A' && digit <= 'F') {
		return digit - 'A' + 10;
	}
	return -1;
}
337
// Parses exactly 8 hexadecimal digits at `line` into `*out`.
//
// Returns a pointer just past the digits, or a null pointer if any of the 8
// characters is not a hexadecimal digit, in which case `*out` is untouched.
const char* hex32(const char* line, uint32_t* out) {
	uint32_t accumulated = 0;
	int remaining;
	for (remaining = 8; remaining > 0; --remaining) {
		const int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accumulated = (accumulated << 4) | nybble;
		++line;
	}
	*out = accumulated;
	return line;
}
353
// Parses exactly 6 hexadecimal digits at `line` into `*out`.
//
// Returns a pointer just past the digits, or a null pointer if any of the 6
// characters is not a hexadecimal digit, in which case `*out` is untouched.
const char* hex24(const char* line, uint32_t* out) {
	uint32_t accumulated = 0;
	int consumed = 0;
	while (consumed < 6) {
		const int nybble = hexDigit(line[consumed]);
		if (nybble < 0) {
			return 0;
		}
		accumulated = (accumulated << 4) | nybble;
		++consumed;
	}
	*out = accumulated;
	return line + consumed;
}
369
// Parses exactly 4 hexadecimal digits at `line` into `*out`.
//
// Returns a pointer just past the digits, or a null pointer if any of the 4
// characters is not a hexadecimal digit, in which case `*out` is left at 0.
const char* hex16(const char* line, uint16_t* out) {
	// Clear the output up front so a failed parse leaves a defined value
	*out = 0;
	uint16_t accumulated = 0;
	int remaining;
	for (remaining = 4; remaining > 0; --remaining) {
		const int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accumulated = (uint16_t) ((accumulated << 4) | nybble);
		++line;
	}
	*out = accumulated;
	return line;
}
386
// Parses exactly 3 hexadecimal digits at `line` into `*out`.
//
// Returns a pointer just past the digits, or a null pointer if any of the 3
// characters is not a hexadecimal digit, in which case `*out` is left at 0.
const char* hex12(const char* line, uint16_t* out) {
	// Clear the output up front so a failed parse leaves a defined value
	*out = 0;
	uint16_t accumulated = 0;
	int consumed = 0;
	while (consumed < 3) {
		const int nybble = hexDigit(line[consumed]);
		if (nybble < 0) {
			return 0;
		}
		accumulated = (uint16_t) ((accumulated << 4) | nybble);
		++consumed;
	}
	*out = accumulated;
	return line + consumed;
}
403
// Parses exactly 2 hexadecimal digits at `line` into `*out`.
//
// Returns a pointer just past the digits, or a null pointer if either
// character is not a hexadecimal digit, in which case `*out` is left at 0.
const char* hex8(const char* line, uint8_t* out) {
	// Clear the output up front so a failed parse leaves a defined value
	*out = 0;
	uint8_t accumulated = 0;
	int remaining;
	for (remaining = 2; remaining > 0; --remaining) {
		const int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accumulated = (uint8_t) ((accumulated << 4) | nybble);
		++line;
	}
	*out = accumulated;
	return line;
}
420
// Parses a single hexadecimal digit at `line` into `*out`.
//
// Returns a pointer just past the digit, matching the other hexN parsers, or
// a null pointer if the character is not a hexadecimal digit, in which case
// `*out` is left at 0.
const char* hex4(const char* line, uint8_t* out) {
	*out = 0;
	int nybble = hexDigit(*line);
	if (nybble < 0) {
		return 0;
	}
	*out = (uint8_t) nybble;
	// Step over the consumed digit so callers can keep parsing from the result
	return line + 1;
}
433
// Strips trailing whitespace (as classified by isspace) from `string` in
// place by overwriting it with NUL bytes. A string consisting only of
// whitespace becomes empty.
void rtrim(char* string) {
	size_t len = strlen(string);
	// Test the bound before touching the byte so nothing before the start of
	// the string is ever read. The unsigned char cast is required because
	// isspace is undefined for negative values other than EOF.
	while (len > 0 && isspace((unsigned char) string[len - 1])) {
		--len;
		string[len] = '\0';
	}
}
444
// Extracts the contents of a quoted string.
//
// `unparsed` (of `unparsedLen` bytes) must begin with a single or double
// quote. Characters are copied into `parsed` (capacity `parsedLen`, zeroed
// first) until the matching quote or a raw newline / carriage return is
// reached. The escapes \n, \r, \\, \' and \" are decoded.
//
// Returns the number of bytes written, or -1 if the input does not start with
// a quote, contains an unknown escape, ends without a terminator, or the
// output fills up before a terminator is seen.
ssize_t parseQuotedString(const char* unparsed, ssize_t unparsedLen, char* parsed, ssize_t parsedLen) {
	memset(parsed, 0, parsedLen);
	if (unparsedLen <= 0 || parsedLen <= 0) {
		return -1;
	}

	// The opening quote decides which character closes the string
	const char quote = unparsed[0];
	if (quote != '"' && quote != '\'') {
		return -1;
	}

	ssize_t written = 0;
	bool afterBackslash = false;
	ssize_t pos;
	for (pos = 1; pos < unparsedLen && written < parsedLen; ++pos) {
		const char current = unparsed[pos];

		if (afterBackslash) {
			char decoded;
			if (current == 'n') {
				decoded = '\n';
			} else if (current == 'r') {
				decoded = '\r';
			} else if (current == '\\' || current == '\'' || current == '"') {
				decoded = current;
			} else {
				return -1;
			}
			parsed[written] = decoded;
			++written;
			afterBackslash = false;
		} else if (current == quote || current == '\n' || current == '\r') {
			// Closing quote or a raw line break ends the string
			return written;
		} else if (current == '\\') {
			afterBackslash = true;
		} else {
			parsed[written] = current;
			++written;
		}
	}
	return -1;
}