src/util/string.c (view raw)
1/* Copyright (c) 2013-2015 Jeffrey Pfau
2 *
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6#include "util/string.h"
7
8#include <string.h>
9
10#ifndef HAVE_STRNDUP
// Fallback strndup for platforms that lack one. Returns a newly allocated,
// NUL-terminated copy of at most len characters of start, or NULL if the
// allocation fails. The caller owns the result and releases it with free().
char* strndup(const char* start, size_t len) {
	// Measure the real length first: this keeps a huge len (e.g. SIZE_MAX)
	// from overflowing the size computation and avoids over-allocating when
	// the source string is shorter than len
	size_t actual = 0;
	while (actual < len && start[actual]) {
		++actual;
	}
	char* out = malloc(actual + 1);
	if (!out) {
		return NULL;
	}
	memcpy(out, start, actual);
	out[actual] = '\0';
	return out;
}
18#endif
19
20#ifndef HAVE_STRDUP
// Fallback strdup for platforms that lack one. Returns a newly allocated copy
// of str (including its terminator), or NULL if the allocation fails. The
// caller owns the result and releases it with free().
char* strdup(const char* str) {
	size_t size = strlen(str) + 1;
	char* out = malloc(size);
	if (!out) {
		return NULL;
	}
	// size includes the terminator, so a single copy finishes the string
	memcpy(out, str, size);
	return out;
}
28#endif
29
// Finds the last occurrence of needle within the first len bytes of haystack.
// Returns a pointer to the start of that occurrence, or NULL when needle does
// not fit in or does not occur within that window. An empty needle matches at
// the very end of the window (haystack + len).
char* strnrstr(const char* restrict haystack, const char* restrict needle, size_t len) {
	size_t needleLen = strlen(needle);
	if (needleLen > len) {
		return NULL;
	}
	// Walk candidate start positions from the last one that still fits back
	// to 0, so the first hit is the answer. The explicit i == 0 test keeps
	// the unsigned index from wrapping around.
	size_t i = len - needleLen;
	for (;;) {
		if (strncmp(needle, &haystack[i], needleLen) == 0) {
			return (char*) &haystack[i];
		}
		if (i == 0) {
			break;
		}
		--i;
	}
	return NULL;
}
41
// Reports whether the string s1 finishes with the string end. An empty end
// always matches; an end longer than s1 never does.
bool endswith(const char* restrict s1, const char* restrict end) {
	size_t suffixLen = strlen(end);
	size_t totalLen = strlen(s1);
	if (suffixLen > totalLen) {
		return false;
	}
	// Both spans are exactly suffixLen bytes long, so a raw byte comparison
	// of the tail decides the match
	const char* tail = s1 + (totalLen - suffixLen);
	return memcmp(tail, end, suffixLen) == 0;
}
50
// Decodes one code point from a UTF-16 stream held in native-endian uint16_t
// units. *unicode is advanced past the units consumed and *length (a count of
// BYTES, not units) is reduced accordingly. Returns the code point, or 0 when
// the input is truncated or the surrogate sequence is malformed. Truncation
// additionally forces *length to 0.
uint32_t utf16Char(const uint16_t** unicode, size_t* length) {
	if (*length < sizeof(uint16_t)) {
		*length = 0;
		return 0;
	}
	uint32_t lead = *(*unicode)++;
	*length -= sizeof(uint16_t);

	// Anything outside the surrogate range is already a complete code point
	if (lead < 0xD800 || lead >= 0xE000) {
		return lead;
	}

	if (*length < sizeof(uint16_t)) {
		*length = 0;
		return 0;
	}
	// The second unit is consumed before validation, so a malformed sequence
	// still swallows two units
	uint32_t trail = *(*unicode)++;
	*length -= sizeof(uint16_t);

	if (lead >= 0xDC00) {
		// Sequence began with a low surrogate
		return 0;
	}
	if (trail < 0xDC00 || trail >= 0xE000) {
		// High surrogate not followed by a low surrogate
		return 0;
	}
	return ((lead - 0xD800) << 10) + (trail - 0xDC00) + 0x10000;
}
80
// Decodes one code point from a UTF-8 byte stream. *unicode is advanced past
// the bytes consumed and *length is reduced accordingly. Returns the code
// point, or 0 when the stream is empty, the lead byte is not a valid 1-4 byte
// lead, the sequence is truncated (*length is then forced to 0), or a
// continuation byte is malformed.
uint32_t utf8Char(const char** unicode, size_t* length) {
	if (*length == 0) {
		return 0;
	}
	// Bytes are handled as unsigned: with a signed char the masks below
	// would operate on a sign-extended value and leak set high bits into
	// the decoded code point
	uint8_t byte = (uint8_t) **unicode;
	--*length;
	++*unicode;
	if (!(byte & 0x80)) {
		return byte;
	}
	// tops[n] is the lead-byte prefix for a sequence with n + 1 continuation
	// bytes, and tops[n + 1] is the mask that isolates that prefix
	static const uint8_t tops[4] = { 0xC0, 0xE0, 0xF0, 0xF8 };
	size_t numBytes;
	for (numBytes = 0; numBytes < 3; ++numBytes) {
		if ((byte & tops[numBytes + 1]) == tops[numBytes]) {
			break;
		}
	}
	if (numBytes == 3) {
		// Stray continuation byte or a lead for more than 4 bytes
		return 0;
	}
	uint32_t unichar = byte & ~tops[numBytes] & 0xFF;
	++numBytes;
	if (*length < numBytes) {
		*length = 0;
		return 0;
	}
	size_t i;
	for (i = 0; i < numBytes; ++i) {
		unichar <<= 6;
		byte = (uint8_t) **unicode;
		--*length;
		++*unicode;
		if ((byte & 0xC0) != 0x80) {
			return 0;
		}
		unichar |= byte & 0x3F;
	}
	return unichar;
}
121
// Encodes one code point as UTF-8 into buffer, which must have room for up to
// 4 bytes; no terminator is written. Values above 0x10FFFF are encoded as
// U+FFFD. Returns the number of bytes written (1-4).
size_t toUtf8(uint32_t unichar, char* buffer) {
	uint32_t codepoint = unichar > 0x10FFFF ? 0xFFFD : unichar;
	if (codepoint < 0x80) {
		buffer[0] = codepoint;
		return 1;
	}

	// Pick the sequence length and the matching lead-byte marker
	size_t count;
	uint32_t leadMark;
	if (codepoint < 0x800) {
		count = 2;
		leadMark = 0xC0;
	} else if (codepoint < 0x10000) {
		count = 3;
		leadMark = 0xE0;
	} else {
		count = 4;
		leadMark = 0xF0;
	}

	// Fill continuation bytes back to front, six payload bits at a time;
	// whatever remains afterwards belongs in the lead byte
	size_t pos;
	for (pos = count - 1; pos > 0; --pos) {
		buffer[pos] = (codepoint & 0x3F) | 0x80;
		codepoint >>= 6;
	}
	buffer[0] = codepoint | leadMark;
	return count;
}
152
// Compares a UTF-16 string with a UTF-8 string code point by code point.
// Both lengths are byte counts. Returns -1 if the UTF-16 string orders first,
// 1 if the UTF-8 string orders first, and 0 if they decode to the same
// sequence. When one string is a prefix of the other, the shorter one orders
// first.
int utfcmp(const uint16_t* utf16, const char* utf8, size_t utf16Length, size_t utf8Length) {
	while (utf16Length > 0 && utf8Length > 0) {
		uint32_t char1 = utf16Char(&utf16, &utf16Length);
		uint32_t char2 = utf8Char(&utf8, &utf8Length);
		// Compare right after decoding so that the final pair of code
		// points is checked before the loop condition ends the walk
		if (char1 < char2) {
			return -1;
		}
		if (char1 > char2) {
			return 1;
		}
	}
	if (utf16Length == 0 && utf8Length > 0) {
		return -1;
	}
	if (utf16Length > 0 && utf8Length == 0) {
		return 1;
	}
	return 0;
}
173
// Converts length BYTES of UTF-16 into a newly allocated, NUL-terminated
// UTF-8 string. Units that utf16Char cannot decode come back as 0 and are
// therefore written as a 0 byte in the output. Returns NULL if the allocation
// fails or the size computation would overflow. The caller owns the result
// and releases it with free().
char* utf16to8(const uint16_t* utf16, size_t length) {
	// Size the buffer for the worst case up front instead of growing it.
	// Each 2-byte unit decodes to at most 3 bytes of UTF-8; a surrogate pair
	// spends two units on 4 bytes, and a trailing odd byte decodes to a
	// single 0, so 3 bytes per (rounded-up) unit always suffices.
	size_t units = length / 2 + (length & 1);
	if (units > ((size_t) -1 - 1) / 3) {
		return 0;
	}
	char* utf8 = malloc(units * 3 + 1);
	if (!utf8) {
		return 0;
	}

	size_t utf8Length = 0;
	while (length > 0) {
		uint32_t unichar = utf16Char(&utf16, &length);
		utf8Length += toUtf8(unichar, &utf8[utf8Length]);
	}
	utf8[utf8Length] = '\0';

	// Give back the unused tail; if shrinking fails the original buffer is
	// still valid and correctly terminated, so hand that out instead
	char* trimmed = realloc(utf8, utf8Length + 1);
	if (!trimmed) {
		return utf8;
	}
	return trimmed;
}
220
// Maps one hexadecimal digit character (0-9, a-f, A-F) to its value 0-15.
// Any other character yields -1.
int hexDigit(char digit) {
	if (digit >= '0' && digit <= '9') {
		return digit - '0';
	}
	if (digit >= 'a' && digit <= 'f') {
		return digit - 'a' + 10;
	}
	if (digit >= 'A' && digit <= 'F') {
		return digit - 'A' + 10;
	}
	return -1;
}
255
// Parses exactly 8 hex digits at line into a 32-bit value. On success stores
// the value in *out and returns a pointer just past the digits. If any
// character is not a hex digit, returns 0 and leaves *out untouched.
const char* hex32(const char* line, uint32_t* out) {
	uint32_t accum = 0;
	int remaining;
	for (remaining = 8; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
271
// Parses exactly 6 hex digits at line into the low 24 bits of a 32-bit value.
// On success stores the value in *out and returns a pointer just past the
// digits. If any character is not a hex digit, returns 0 and leaves *out
// untouched.
const char* hex24(const char* line, uint32_t* out) {
	uint32_t accum = 0;
	int remaining;
	for (remaining = 6; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
287
// Parses exactly 4 hex digits at line into a 16-bit value. *out is cleared
// first; on success it receives the value and a pointer just past the digits
// is returned. If any character is not a hex digit, returns 0 with *out left
// at 0.
const char* hex16(const char* line, uint16_t* out) {
	uint16_t accum = 0;
	int remaining;
	*out = 0;
	for (remaining = 4; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accum = (uint16_t) ((accum << 4) | nybble);
		++line;
	}
	*out = accum;
	return line;
}
304
// Parses exactly 3 hex digits at line into the low 12 bits of a 16-bit value.
// *out is cleared first; on success it receives the value and a pointer just
// past the digits is returned. If any character is not a hex digit, returns 0
// with *out left at 0.
const char* hex12(const char* line, uint16_t* out) {
	uint16_t accum = 0;
	int remaining;
	*out = 0;
	for (remaining = 3; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accum = (uint16_t) ((accum << 4) | nybble);
		++line;
	}
	*out = accum;
	return line;
}
321
// Parses exactly 2 hex digits at line into an 8-bit value. *out is cleared
// first; on success it receives the value and a pointer just past the digits
// is returned. If either character is not a hex digit, returns 0 with *out
// left at 0.
const char* hex8(const char* line, uint8_t* out) {
	uint8_t accum = 0;
	int remaining;
	*out = 0;
	for (remaining = 2; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		accum = (uint8_t) ((accum << 4) | nybble);
		++line;
	}
	*out = accum;
	return line;
}
338
// Parses a single hex digit at line into the low 4 bits of *out. *out is
// cleared first; if the character is not a hex digit, returns 0 with *out
// left at 0.
// NOTE(review): unlike the multi-digit hexN parsers in this file, the pointer
// returned on success is line itself, not the position after the digit.
// Confirm against callers before changing.
const char* hex4(const char* line, uint8_t* out) {
	*out = 0;
	int nybble = hexDigit(*line);
	if (nybble < 0) {
		return 0;
	}
	*out = (uint8_t) nybble;
	return line;
}
352
// Strips trailing whitespace (as classified by isspace) from string in place
// by overwriting each trailing whitespace character with a terminator.
void rtrim(char* string) {
	size_t len = strlen(string);
	// The length check comes first so an all-whitespace string never reads
	// before the start of the buffer. The cast keeps negative char values
	// out of isspace, which only accepts unsigned char values or EOF.
	while (len > 0 && isspace((unsigned char) string[len - 1])) {
		--len;
		string[len] = '\0';
	}
}
362}