all repos — mgba @ dab12cf5c674542cae0db7708c333035255fbc65

mGBA Game Boy Advance Emulator

src/util/string.c (view raw)

  1/* Copyright (c) 2013-2019 Jeffrey Pfau
  2 *
  3 * This Source Code Form is subject to the terms of the Mozilla Public
  4 * License, v. 2.0. If a copy of the MPL was not distributed with this
  5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
  6#include <mgba-util/string.h>
  7
  8#include <mgba-util/vector.h>
  9
 10#include <string.h>
 11
 12DEFINE_VECTOR(StringList, char*);
 13
 14#ifndef HAVE_STRNDUP
 15char* strndup(const char* start, size_t len) {
 16	// This is suboptimal, but anything recent should have strndup
 17	char* out = malloc((len + 1) * sizeof(char));
 18	strncpy(out, start, len);
 19	out[len] = '\0';
 20	return out;
 21}
 22#endif
 23
 24#ifndef HAVE_STRDUP
 25char* strdup(const char* str) {
 26	size_t len = strlen(str);
 27	char* out = malloc(len + 1);
 28	strncpy(out, str, len);
 29	out[len] = '\0';
 30	return out;
 31}
 32#endif
 33
 34char* strnrstr(const char* restrict haystack, const char* restrict needle, size_t len) {
 35	char* last = 0;
 36	const char* next = haystack;
 37	size_t needleLen = strlen(needle);
 38	for (; len >= needleLen; --len, ++next) {
 39		if (strncmp(needle, next, needleLen) == 0) {
 40			last = (char*) next;
 41		}
 42	}
 43	return last;
 44}
 45
 46bool endswith(const char* restrict s1, const char* restrict end) {
 47	size_t len = strlen(s1);
 48	size_t endLen = strlen(end);
 49	if (len < endLen) {
 50		return false;
 51	}
 52	return strcmp(&s1[len - endLen], end) == 0;
 53}
 54
 55bool startswith(const char* restrict s1, const char* restrict start) {
 56	size_t len = strlen(s1);
 57	size_t startLen = strlen(start);
 58	if (len < startLen) {
 59		return false;
 60	}
 61	return strncmp(s1, start, startLen) == 0;
 62}
 63
 64uint32_t utf16Char(const uint16_t** unicode, size_t* length) {
 65	if (*length < 2) {
 66		*length = 0;
 67		return 0;
 68	}
 69	uint32_t unichar = **unicode;
 70	++*unicode;
 71	*length -= 2;
 72	if (unichar < 0xD800 || unichar >= 0xE000) {
 73		return unichar;
 74	}
 75	if (*length < 2) {
 76		*length = 0;
 77		return 0;
 78	}
 79	uint16_t highSurrogate = unichar;
 80	uint16_t lowSurrogate = **unicode;
 81	++*unicode;
 82	*length -= 2;
 83	if (highSurrogate >= 0xDC00) {
 84		return 0;
 85	}
 86	if (lowSurrogate < 0xDC00 || lowSurrogate >= 0xE000) {
 87		return 0;
 88	}
 89	highSurrogate -= 0xD800;
 90	lowSurrogate -= 0xDC00;
 91	return (highSurrogate << 10) + lowSurrogate + 0x10000;
 92}
 93
 94uint32_t utf8Char(const char** unicode, size_t* length) {
 95	if (*length == 0) {
 96		return 0;
 97	}
 98	char byte = **unicode;
 99	--*length;
100	++*unicode;
101	if (!(byte & 0x80)) {
102		return byte;
103	}
104	uint32_t unichar;
105	static const int tops[4] = { 0xC0, 0xE0, 0xF0, 0xF8 };
106	size_t numBytes;
107	for (numBytes = 0; numBytes < 3; ++numBytes) {
108		if ((byte & tops[numBytes + 1]) == tops[numBytes]) {
109			break;
110		}
111	}
112	unichar = byte & ~tops[numBytes];
113	if (numBytes == 3) {
114		return 0;
115	}
116	++numBytes;
117	if (*length < numBytes) {
118		*length = 0;
119		return 0;
120	}
121	size_t i;
122	for (i = 0; i < numBytes; ++i) {
123		unichar <<= 6;
124		byte = **unicode;
125		--*length;
126		++*unicode;
127		if ((byte & 0xC0) != 0x80) {
128			return 0;
129		}
130		unichar |= byte & 0x3F;
131	}
132	return unichar;
133}
134
/* Encodes the code point `unichar` as UTF-8 into `buffer`, which must have
 * room for up to 4 bytes. No NUL terminator is written.
 *
 * Values above 0x10FFFF are replaced by U+FFFD before encoding.
 *
 * Returns the number of bytes written (1 to 4).
 */
size_t toUtf8(uint32_t unichar, char* buffer) {
	uint32_t rest = unichar > 0x10FFFF ? 0xFFFD : unichar;
	if (rest < 0x80) {
		buffer[0] = rest;
		return 1;
	}

	// Pick the sequence length and the matching lead-byte prefix
	size_t total;
	uint32_t lead;
	if (rest < 0x800) {
		total = 2;
		lead = 0xC0;
	} else if (rest < 0x10000) {
		total = 3;
		lead = 0xE0;
	} else {
		total = 4;
		lead = 0xF0;
	}

	// Fill continuation bytes from the tail, six payload bits at a time
	size_t i;
	for (i = total - 1; i > 0; --i) {
		buffer[i] = (rest & 0x3F) | 0x80;
		rest >>= 6;
	}
	// Whatever bits remain belong in the lead byte
	buffer[0] = rest | lead;
	return total;
}
165
/* Compares a UTF-16 string against a UTF-8 string, code point by code point.
 *
 * `utf16Length` and `utf8Length` are the sizes of the two inputs in bytes.
 *
 * Returns -1 if the UTF-16 string sorts first, 1 if the UTF-8 string sorts
 * first, and 0 if they are equal. When one string is a prefix of the other,
 * the shorter one sorts first.
 */
int utfcmp(const uint16_t* utf16, const char* utf8, size_t utf16Length, size_t utf8Length) {
	while (utf16Length > 0 && utf8Length > 0) {
		uint32_t char1 = utf16Char(&utf16, &utf16Length);
		uint32_t char2 = utf8Char(&utf8, &utf8Length);
		// Compare immediately after decoding; deferring the comparison to the
		// next iteration would skip the final pair once either input runs out
		if (char1 < char2) {
			return -1;
		}
		if (char1 > char2) {
			return 1;
		}
	}
	if (utf16Length == 0 && utf8Length > 0) {
		return -1;
	}
	if (utf16Length > 0 && utf8Length == 0) {
		return 1;
	}
	return 0;
}
186
/* Converts a UTF-16 string to a newly allocated, NUL-terminated UTF-8 string.
 *
 * `length` is the size of `utf16` in bytes. Each code point is decoded with
 * utf16Char and re-encoded with toUtf8; a code point that decodes to 0 is
 * emitted as a single NUL byte.
 *
 * Returns the new string (caller frees it), or NULL if allocation fails.
 */
char* utf16to8(const uint16_t* utf16, size_t length) {
	char* utf8 = NULL;
	char buffer[4];
	size_t capacity = 0;
	size_t utf8Length = 0;
	while (length > 0) {
		uint32_t unichar = utf16Char(&utf16, &length);
		size_t bytes = toUtf8(unichar, buffer);
		if (utf8Length + bytes > capacity) {
			// The first allocation is sized from the remaining input plus one
			// full character; afterwards the buffer doubles. Both are always
			// large enough for the character about to be appended.
			size_t newCapacity = capacity ? capacity * 2 : length + sizeof(buffer);
			char* grown = realloc(utf8, newCapacity);
			if (!grown) {
				free(utf8);
				return NULL;
			}
			utf8 = grown;
			capacity = newCapacity;
		}
		memcpy(utf8 + utf8Length, buffer, bytes);
		utf8Length += bytes;
	}

	// Trim (or, for empty input, create) the buffer to the exact size
	char* newUTF8 = realloc(utf8, utf8Length + 1);
	if (!newUTF8) {
		free(utf8);
		return NULL;
	}
	newUTF8[utf8Length] = '\0';
	return newUTF8;
}
233
234extern const uint16_t gbkUnicodeTable[];
235
236char* gbkToUtf8(const char* gbk, size_t length) {
237	char* utf8 = NULL;
238	char* utf8Offset = NULL;
239	size_t offset;
240	uint8_t gbk1 = 0;
241	char buffer[4];
242	size_t utf8TotalBytes = 0;
243	size_t utf8Length = 0;
244	for (offset = 0; offset < length; ++offset) {
245		if (length == 0) {
246			break;
247		}
248		unsigned unichar = 0xFFFD;
249		if (!gbk1 && !(gbk[offset] & 0x80)) {
250			unichar = gbk[offset];
251		} else if (gbk1) {
252			uint8_t gbk2 = gbk[offset];
253			if (gbk2 >= 0x40 && gbk2 != 0xFF) {
254				// TODO: GB-18030 support?
255				unichar = gbkUnicodeTable[gbk1 * 0xBF + gbk2 - 0x40];
256			}
257			gbk1 = 0;
258		} else if (((uint8_t*) gbk)[offset] == 0xFF) {
259			unichar = 0xFFFD;
260		} else if (((uint8_t*) gbk)[offset] == 0x80) {
261			unichar = 0x20AC; // Euro
262		} else {
263			gbk1 = ((uint8_t*) gbk)[offset] - 0x81;
264			continue;
265		}
266
267		size_t bytes = toUtf8(unichar, buffer);
268		utf8Length += bytes;
269		if (!utf8) {
270			utf8 = malloc(length);
271			if (!utf8) {
272				return NULL;
273			}
274			utf8TotalBytes = length;
275			memcpy(utf8, buffer, bytes);
276			utf8Offset = utf8 + bytes;
277		} else if (utf8Length < utf8TotalBytes) {
278			memcpy(utf8Offset, buffer, bytes);
279			utf8Offset += bytes;
280		} else if (utf8Length >= utf8TotalBytes) {
281			ptrdiff_t o = utf8Offset - utf8;
282			char* newUTF8 = realloc(utf8, utf8TotalBytes * 2);
283			utf8Offset = o + newUTF8;
284			if (!newUTF8) {
285				free(utf8);
286				return 0;
287			}
288			utf8 = newUTF8;
289			memcpy(utf8Offset, buffer, bytes);
290			utf8Offset += bytes;
291		}
292	}
293
294	char* newUTF8 = realloc(utf8, utf8Length + 1);
295	if (!newUTF8) {
296		free(utf8);
297		return 0;
298	}
299	newUTF8[utf8Length] = '\0';
300	return newUTF8;
301}
302
/* Converts one hexadecimal digit character (0-9, a-f, A-F) to its value.
 *
 * Returns 0 to 15 for a valid digit, or -1 for any other character.
 */
int hexDigit(char digit) {
	if (digit >= '0' && digit <= '9') {
		return digit - '0';
	}
	if (digit >= 'a' && digit <= 'f') {
		return digit - 'a' + 10;
	}
	if (digit >= 'A' && digit <= 'F') {
		return digit - 'A' + 10;
	}
	return -1;
}
337
/* Parses exactly 8 hexadecimal digits from `line` into `*out`.
 *
 * Returns a pointer just past the digits consumed, or a null pointer if any
 * of the 8 characters is not a hex digit; `*out` is left untouched on failure.
 */
const char* hex32(const char* line, uint32_t* out) {
	uint32_t accum = 0;
	int remaining;
	for (remaining = 8; remaining > 0; --remaining) {
		const int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
353
/* Parses exactly 6 hexadecimal digits from `line` into `*out`.
 *
 * Returns a pointer just past the digits consumed, or a null pointer if any
 * of the 6 characters is not a hex digit; `*out` is left untouched on failure.
 */
const char* hex24(const char* line, uint32_t* out) {
	uint32_t accum = 0;
	int remaining;
	for (remaining = 6; remaining > 0; --remaining) {
		const int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
369
/* Parses exactly 4 hexadecimal digits from `line` into `*out`.
 *
 * `*out` is zeroed up front, so it reads 0 when parsing fails.
 *
 * Returns a pointer just past the digits consumed, or a null pointer if any
 * of the 4 characters is not a hex digit.
 */
const char* hex16(const char* line, uint16_t* out) {
	uint16_t accum = 0;
	int remaining;
	*out = 0;
	for (remaining = 4; remaining > 0; --remaining) {
		const int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
386
/* Parses exactly 3 hexadecimal digits from `line` into `*out`.
 *
 * `*out` is zeroed up front, so it reads 0 when parsing fails.
 *
 * Returns a pointer just past the digits consumed, or a null pointer if any
 * of the 3 characters is not a hex digit.
 */
const char* hex12(const char* line, uint16_t* out) {
	uint16_t accum = 0;
	int remaining;
	*out = 0;
	for (remaining = 3; remaining > 0; --remaining) {
		const int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
403
/* Parses exactly 2 hexadecimal digits from `line` into `*out`.
 *
 * `*out` is zeroed up front, so it reads 0 when parsing fails.
 *
 * Returns a pointer just past the digits consumed, or a null pointer if
 * either character is not a hex digit.
 */
const char* hex8(const char* line, uint8_t* out) {
	uint8_t accum = 0;
	int remaining;
	*out = 0;
	for (remaining = 2; remaining > 0; --remaining) {
		const int nybble = hexDigit(*line);
		if (nybble < 0) {
			return NULL;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
420
/* Parses exactly 1 hexadecimal digit from `line` into `*out`.
 *
 * `*out` is zeroed up front, so it reads 0 when parsing fails.
 *
 * Returns a pointer just past the digit consumed, or a null pointer if the
 * character is not a hex digit.
 */
const char* hex4(const char* line, uint8_t* out) {
	*out = 0;
	int nybble = hexDigit(*line);
	if (nybble < 0) {
		return NULL;
	}
	*out = nybble;
	// Step over the digit so the result points at the next unparsed
	// character, as the wider hexN parsers in this file do
	return line + 1;
}
433
/* Strips trailing whitespace (as classified by isspace) from the
 * NUL-terminated string `string`, in place, by overwriting it with NULs.
 */
void rtrim(char* string) {
	size_t len = strlen(string);
	// Test the bound before reading, so an all-whitespace string never reads
	// string[-1]. The unsigned char cast keeps isspace defined for bytes
	// >= 0x80 where plain char is signed.
	while (len > 0 && isspace((unsigned char) string[len - 1])) {
		--len;
		string[len] = '\0';
	}
}
444
/* Parses a single- or double-quoted string literal from `unparsed`.
 *
 * `parsed` (capacity `parsedLen` bytes) is zero-filled first and then receives
 * the unescaped contents. Recognized escapes are \n, \r, \\, \' and \".
 *
 * Returns the number of bytes written once the matching closing quote, or an
 * unescaped newline or carriage return, is reached. Returns -1 if the input
 * does not start with a quote, contains an unknown escape, or either the
 * input or the output space runs out before a terminator is found.
 */
ssize_t parseQuotedString(const char* unparsed, ssize_t unparsedLen, char* parsed, ssize_t parsedLen) {
	memset(parsed, 0, parsedLen);
	if (unparsedLen < 1 || parsedLen < 1) {
		return -1;
	}

	// The first character picks the quote style and is not copied
	const char quote = unparsed[0];
	if (quote != '"' && quote != '\'') {
		return -1;
	}

	ssize_t written = 0;
	bool afterBackslash = false;
	ssize_t pos;
	for (pos = 1; pos < unparsedLen && written < parsedLen; ++pos) {
		const char c = unparsed[pos];
		if (afterBackslash) {
			char decoded;
			if (c == 'n') {
				decoded = '\n';
			} else if (c == 'r') {
				decoded = '\r';
			} else if (c == '\\' || c == '\'' || c == '"') {
				// These escapes stand for the character itself
				decoded = c;
			} else {
				return -1;
			}
			parsed[written] = decoded;
			++written;
			afterBackslash = false;
			continue;
		}
		if (c == quote) {
			return written;
		}
		if (c == '\\') {
			afterBackslash = true;
			continue;
		}
		if (c == '\n' || c == '\r') {
			// A raw line break ends the string just like a closing quote
			return written;
		}
		parsed[written] = c;
		++written;
	}
	return -1;
}
504}