src/util/string.c (view raw)
1/* Copyright (c) 2013-2015 Jeffrey Pfau
2 *
3 * This Source Code Form is subject to the terms of the Mozilla Public
4 * License, v. 2.0. If a copy of the MPL was not distributed with this
5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6#include "util/string.h"
7
8#include <string.h>
9
10#ifndef HAVE_STRNDUP
/*
 * Fallback strndup for platforms whose libc does not provide one.
 *
 * Copies at most len bytes from start, stopping early at a NUL terminator,
 * and returns them as a freshly allocated, NUL-terminated string. The caller
 * owns the result and must free() it. Returns 0 if the allocation fails.
 */
char* strndup(const char* start, size_t len) {
	// Measure the real payload so short inputs neither over-allocate nor get padded
	size_t copyLen = 0;
	while (copyLen < len && start[copyLen] != '\0') {
		++copyLen;
	}
	char* out = malloc(copyLen + 1);
	if (!out) {
		return 0;
	}
	memcpy(out, start, copyLen);
	out[copyLen] = '\0';
	return out;
}
18#endif
19
/*
 * Finds the last occurrence of needle within the first len bytes of haystack.
 *
 * Returns a pointer to the start of the last match, or 0 if needle does not
 * occur in that range (including when needle is longer than len). An empty
 * needle matches at the end of the searched range, so haystack + len is
 * returned for it.
 */
char* strnrstr(const char* restrict haystack, const char* restrict needle, size_t len) {
	size_t needleLen = strlen(needle);
	if (needleLen > len) {
		return 0;
	}
	if (needleLen == 0) {
		// Must be special-cased: a loop bounded by "len >= 0" can never end for size_t
		return (char*) haystack + len;
	}
	// Walk the candidate start positions from the back so the first hit is the last match
	size_t pos = len - needleLen + 1;
	while (pos-- > 0) {
		if (strncmp(needle, haystack + pos, needleLen) == 0) {
			return (char*) haystack + pos;
		}
	}
	return 0;
}
31
/*
 * Decodes one code point from a buffer of UTF-16 code units (read as host
 * uint16_t values) and advances the cursor past what was consumed.
 *
 * unicode: in/out pointer to the next code unit.
 * length:  in/out number of BYTES remaining; reduced by 2 per unit consumed.
 *
 * Returns the decoded code point. Returns 0 when fewer than 2 bytes remain
 * (length is then forced to 0), when a surrogate is not followed by another
 * full unit (length forced to 0), or when the two units read do not form a
 * valid high/low surrogate pair (both units are still consumed).
 */
static uint32_t _utf16Char(const uint16_t** unicode, size_t* length) {
	if (*length < 2) {
		*length = 0;
		return 0;
	}
	const uint16_t* cursor = *unicode;
	uint32_t lead = *cursor++;
	*unicode = cursor;
	*length -= 2;

	// Anything outside the surrogate block is a complete code point on its own
	if (!(lead >= 0xD800 && lead <= 0xDFFF)) {
		return lead;
	}

	if (*length < 2) {
		*length = 0;
		return 0;
	}
	uint32_t trail = *cursor++;
	*unicode = cursor;
	*length -= 2;

	// The first unit must be a high surrogate and the second a low surrogate
	if (lead > 0xDBFF || trail < 0xDC00 || trail > 0xDFFF) {
		return 0;
	}
	return 0x10000 + ((lead - 0xD800) << 10) + (trail - 0xDC00);
}
61
/*
 * Decodes one code point from a UTF-8 byte buffer and advances the cursor
 * past the bytes that were consumed.
 *
 * unicode: in/out pointer to the next byte.
 * length:  in/out number of bytes remaining.
 *
 * Returns the decoded code point. Returns 0 when the buffer is empty, when
 * the lead byte is not a valid 1- to 4-byte lead, when the sequence is
 * truncated (length is then forced to 0), or when a continuation byte is
 * malformed. Overlong encodings and encoded surrogates are not rejected.
 */
static uint32_t _utf8Char(const char** unicode, size_t* length) {
	if (*length == 0) {
		return 0;
	}
	// Decode through an unsigned byte: plain char may be signed, and a
	// sign-extended lead byte would leak set high bits into the result
	uint8_t byte = (uint8_t) **unicode;
	--*length;
	++*unicode;
	if (!(byte & 0x80)) {
		return byte;
	}

	// tops[n] is the lead-byte prefix of an (n + 2)-byte sequence and
	// tops[n + 1] is the mask that isolates that prefix
	static const int tops[4] = { 0xC0, 0xE0, 0xF0, 0xF8 };
	size_t numBytes;
	for (numBytes = 0; numBytes < 3; ++numBytes) {
		if ((byte & tops[numBytes + 1]) == tops[numBytes]) {
			break;
		}
	}
	if (numBytes == 3) {
		// Stray continuation byte or a lead byte of 0xF8 and above
		return 0;
	}
	uint32_t unichar = byte & ~tops[numBytes];

	// numBytes now counts the continuation bytes that must follow
	++numBytes;
	if (*length < numBytes) {
		*length = 0;
		return 0;
	}
	size_t i;
	for (i = 0; i < numBytes; ++i) {
		unichar <<= 6;
		byte = (uint8_t) **unicode;
		--*length;
		++*unicode;
		if ((byte & 0xC0) != 0x80) {
			return 0;
		}
		unichar |= byte & 0x3F;
	}
	return unichar;
}
102
/*
 * Encodes a single code point as UTF-8.
 *
 * unichar: code point to encode; values above 0x10FFFF are replaced with
 *          U+FFFD before encoding.
 * buffer:  destination with room for at least 4 bytes; no terminator is
 *          written.
 *
 * Returns the number of bytes stored in buffer (1 to 4).
 */
static size_t _toUtf8(uint32_t unichar, char* buffer) {
	uint32_t cp = unichar > 0x10FFFF ? 0xFFFD : unichar;

	if (cp < 0x80) {
		buffer[0] = cp;
		return 1;
	}

	size_t count;
	uint32_t leadMark;
	if (cp < 0x800) {
		count = 2;
		leadMark = 0xC0;
	} else if (cp < 0x10000) {
		count = 3;
		leadMark = 0xE0;
	} else if (cp < 0x200000) {
		count = 4;
		leadMark = 0xF0;
	} else {
		// Not reachable after the range clamp above
		return 0;
	}

	// Fill continuation bytes from the tail, peeling off 6 payload bits each time
	size_t i;
	for (i = count - 1; i > 0; --i) {
		buffer[i] = (cp & 0x3F) | 0x80;
		cp >>= 6;
	}
	// Whatever payload is left belongs in the lead byte
	buffer[0] = cp | leadMark;
	return count;
}
133
/*
 * Compares a UTF-16 string against a UTF-8 string code point by code point.
 *
 * utf16Length and utf8Length are the sizes of the two buffers in bytes.
 *
 * Returns -1 if the UTF-16 string orders first, 1 if the UTF-8 string orders
 * first, and 0 if they decode to the same sequence. When one input runs out
 * while the decoded prefixes are equal, the exhausted one orders first.
 */
int utfcmp(const uint16_t* utf16, const char* utf8, size_t utf16Length, size_t utf8Length) {
	while (utf16Length > 0 && utf8Length > 0) {
		uint32_t char1 = _utf16Char(&utf16, &utf16Length);
		uint32_t char2 = _utf8Char(&utf8, &utf8Length);
		// Compare right after decoding so the final pair is checked too,
		// even when decoding it drains both inputs
		if (char1 < char2) {
			return -1;
		}
		if (char1 > char2) {
			return 1;
		}
	}
	if (utf16Length == 0 && utf8Length > 0) {
		return -1;
	}
	if (utf16Length > 0 && utf8Length == 0) {
		return 1;
	}
	return 0;
}
154
/*
 * Converts a buffer of UTF-16 code units into a newly allocated,
 * NUL-terminated UTF-8 string.
 *
 * utf16:  source code units.
 * length: size of the source in BYTES.
 *
 * Returns a malloc'd string that the caller must free(), or 0 if memory
 * could not be allocated. An empty input yields an empty string. Input that
 * the decoder rejects (it reports code point 0) is emitted as a NUL byte, so
 * the result may contain embedded NULs.
 */
char* utf16to8(const uint16_t* utf16, size_t length) {
	// Worst case: every decode step emits at most 3 bytes per 16-bit unit it
	// consumes (a surrogate pair is 2 units for 4 bytes), and a dangling odd
	// byte at the end emits one more. Reserve that plus the terminator up
	// front so no growth bookkeeping is needed while converting.
	size_t units = length / 2;
	if (units > (SIZE_MAX - 2) / 3) {
		return 0;
	}
	char* utf8 = malloc(units * 3 + 2);
	if (!utf8) {
		return 0;
	}

	size_t utf8Length = 0;
	while (length > 0) {
		uint32_t unichar = _utf16Char(&utf16, &length);
		utf8Length += _toUtf8(unichar, utf8 + utf8Length);
	}
	utf8[utf8Length] = '\0';

	// Give back the unused tail; if shrinking fails the original block is
	// still valid and already terminated, so hand that back instead
	char* shrunk = realloc(utf8, utf8Length + 1);
	return shrunk ? shrunk : utf8;
}
201
/*
 * Converts one hexadecimal digit character to its numeric value.
 *
 * Accepts '0'-'9', 'a'-'f' and 'A'-'F'. Returns the value 0-15, or -1 if
 * the character is not a hexadecimal digit.
 */
int hexDigit(char digit) {
	if (digit >= '0' && digit <= '9') {
		return digit - '0';
	}
	if (digit >= 'a' && digit <= 'f') {
		return digit - 'a' + 10;
	}
	if (digit >= 'A' && digit <= 'F') {
		return digit - 'A' + 10;
	}
	return -1;
}
236
/*
 * Parses exactly eight hexadecimal digits from line into a 32-bit value.
 *
 * On success stores the value in *out and returns a pointer to the character
 * just past the eighth digit. If any of the eight characters is not a hex
 * digit, returns 0 and leaves *out untouched.
 */
const char* hex32(const char* line, uint32_t* out) {
	uint32_t accum = 0;
	int remaining;
	for (remaining = 8; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		// Most significant digit comes first, so shift before merging
		accum = (accum << 4) | nybble;
		++line;
	}
	*out = accum;
	return line;
}
252
/*
 * Parses exactly four hexadecimal digits from line into a 16-bit value.
 *
 * *out is cleared to 0 before parsing starts. On success the parsed value is
 * stored in *out and a pointer to the character just past the fourth digit
 * is returned. If any of the four characters is not a hex digit, returns 0
 * and *out stays 0.
 */
const char* hex16(const char* line, uint16_t* out) {
	*out = 0;
	uint16_t accum = 0;
	int remaining;
	for (remaining = 4; remaining > 0; --remaining) {
		int nybble = hexDigit(*line);
		if (nybble < 0) {
			return 0;
		}
		// Most significant digit comes first, so shift before merging
		accum = (uint16_t) ((accum << 4) | nybble);
		++line;
	}
	*out = accum;
	return line;
}
268}