VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 31418

Last change on this file since 31418 was 31418, checked in by vboxsync, 15 years ago

iprt/string.h,utf-8.cpp: avoid including uni.h; misc nits.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 53.7 KB
Line 
1/* $Id: utf-8.cpp 31418 2010-08-05 17:37:13Z vboxsync $ */
2/** @file
3 * IPRT - UTF-8 Decoding.
4 */
5
6/*
7 * Copyright (C) 2006-2010 Oracle Corporation
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.215389.xyz. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 */
26
27
28/*******************************************************************************
29* Header Files *
30*******************************************************************************/
31#include <iprt/string.h>
32#include "internal/iprt.h"
33
34#include <iprt/uni.h>
35#include <iprt/alloc.h>
36#include <iprt/assert.h>
37#include <iprt/err.h>
38#include "internal/string.h"
39
40
41
42/**
43 * Get get length in code points of a UTF-8 encoded string.
44 * The string is validated while doing this.
45 *
46 * @returns IPRT status code.
47 * @param psz Pointer to the UTF-8 string.
48 * @param cch The max length of the string. (btw cch = cb)
49 * Use RTSTR_MAX if all of the string is to be examined.
50 * @param pcuc Where to store the length in unicode code points.
51 * @param pcchActual Where to store the actual size of the UTF-8 string
52 * on success (cch = cb again). Optional.
53 */
54int rtUtf8Length(const char *psz, size_t cch, size_t *pcuc, size_t *pcchActual)
55{
56 const unsigned char *puch = (const unsigned char *)psz;
57 size_t cCodePoints = 0;
58 while (cch > 0)
59 {
60 const unsigned char uch = *puch;
61 if (!uch)
62 break;
63 if (uch & RT_BIT(7))
64 {
65 /* figure sequence length and validate the first byte */
66/** @todo RT_USE_RTC_3629 */
67 unsigned cb;
68 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
69 cb = 2;
70 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
71 cb = 3;
72 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
73 cb = 4;
74 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
75 cb = 5;
76 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
77 cb = 6;
78 else
79 {
80 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81 return VERR_INVALID_UTF8_ENCODING;
82 }
83
84 /* check length */
85 if (cb > cch)
86 {
87 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88 return VERR_INVALID_UTF8_ENCODING;
89 }
90
91 /* validate the rest */
92 switch (cb)
93 {
94 case 6:
95 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
96 case 5:
97 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98 case 4:
99 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100 case 3:
101 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
102 case 2:
103 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104 break;
105 }
106
107 /* validate the code point. */
108 RTUNICP uc;
109 switch (cb)
110 {
111 case 6:
112 uc = (puch[5] & 0x3f)
113 | ((RTUNICP)(puch[4] & 0x3f) << 6)
114 | ((RTUNICP)(puch[3] & 0x3f) << 12)
115 | ((RTUNICP)(puch[2] & 0x3f) << 18)
116 | ((RTUNICP)(puch[1] & 0x3f) << 24)
117 | ((RTUNICP)(uch & 0x01) << 30);
118 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120 break;
121 case 5:
122 uc = (puch[4] & 0x3f)
123 | ((RTUNICP)(puch[3] & 0x3f) << 6)
124 | ((RTUNICP)(puch[2] & 0x3f) << 12)
125 | ((RTUNICP)(puch[1] & 0x3f) << 18)
126 | ((RTUNICP)(uch & 0x03) << 24);
127 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129 break;
130 case 4:
131 uc = (puch[3] & 0x3f)
132 | ((RTUNICP)(puch[2] & 0x3f) << 6)
133 | ((RTUNICP)(puch[1] & 0x3f) << 12)
134 | ((RTUNICP)(uch & 0x07) << 18);
135 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137 break;
138 case 3:
139 uc = (puch[2] & 0x3f)
140 | ((RTUNICP)(puch[1] & 0x3f) << 6)
141 | ((RTUNICP)(uch & 0x0f) << 12);
142 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
146 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147 break;
148 case 2:
149 uc = (puch[1] & 0x3f)
150 | ((RTUNICP)(uch & 0x1f) << 6);
151 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153 break;
154 }
155
156 /* advance */
157 cch -= cb;
158 puch += cb;
159 }
160 else
161 {
162 /* one ASCII byte */
163 puch++;
164 cch--;
165 }
166 cCodePoints++;
167 }
168
169 /* done */
170 *pcuc = cCodePoints;
171 if (pcchActual)
172 *pcchActual = puch - (unsigned char const *)psz;
173 return VINF_SUCCESS;
174}
175
176
177/**
178 * Decodes and UTF-8 string into an array of unicode code point.
179 *
180 * Since we know the input is valid, we do *not* perform encoding or length checks.
181 *
182 * @returns iprt status code.
183 * @param psz The UTF-8 string to recode. This is a valid encoding.
184 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186 * @param paCps Where to store the code points array.
187 * @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188 */
189static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190{
191 int rc = VINF_SUCCESS;
192 const unsigned char *puch = (const unsigned char *)psz;
193 PRTUNICP pCp = paCps;
194 while (cch > 0)
195 {
196 /* read the next char and check for terminator. */
197 const unsigned char uch = *puch;
198 if (!uch)
199 break;
200
201 /* check for output overflow */
202 if (RT_UNLIKELY(cCps < 1))
203 {
204 rc = VERR_BUFFER_OVERFLOW;
205 break;
206 }
207 cCps--;
208
209 /* decode and recode the code point */
210 if (!(uch & RT_BIT(7)))
211 {
212 *pCp++ = uch;
213 puch++;
214 cch--;
215 }
216#ifdef RT_STRICT
217 else if (!(uch & RT_BIT(6)))
218 AssertMsgFailed(("Internal error!\n"));
219#endif
220 else if (!(uch & RT_BIT(5)))
221 {
222 *pCp++ = (puch[1] & 0x3f)
223 | ((uint16_t)(uch & 0x1f) << 6);
224 puch += 2;
225 cch -= 2;
226 }
227 else if (!(uch & RT_BIT(4)))
228 {
229 *pCp++ = (puch[2] & 0x3f)
230 | ((uint16_t)(puch[1] & 0x3f) << 6)
231 | ((uint16_t)(uch & 0x0f) << 12);
232 puch += 3;
233 cch -= 3;
234 }
235 else if (!(uch & RT_BIT(3)))
236 {
237 *pCp++ = (puch[3] & 0x3f)
238 | ((RTUNICP)(puch[2] & 0x3f) << 6)
239 | ((RTUNICP)(puch[1] & 0x3f) << 12)
240 | ((RTUNICP)(uch & 0x07) << 18);
241 puch += 4;
242 cch -= 4;
243 }
244 else if (!(uch & RT_BIT(2)))
245 {
246 *pCp++ = (puch[4] & 0x3f)
247 | ((RTUNICP)(puch[3] & 0x3f) << 6)
248 | ((RTUNICP)(puch[2] & 0x3f) << 12)
249 | ((RTUNICP)(puch[1] & 0x3f) << 18)
250 | ((RTUNICP)(uch & 0x03) << 24);
251 puch += 5;
252 cch -= 6;
253 }
254 else
255 {
256 Assert(!(uch & RT_BIT(1)));
257 *pCp++ = (puch[5] & 0x3f)
258 | ((RTUNICP)(puch[4] & 0x3f) << 6)
259 | ((RTUNICP)(puch[3] & 0x3f) << 12)
260 | ((RTUNICP)(puch[2] & 0x3f) << 18)
261 | ((RTUNICP)(puch[1] & 0x3f) << 24)
262 | ((RTUNICP)(uch & 0x01) << 30);
263 puch += 6;
264 cch -= 6;
265 }
266 }
267
268 /* done */
269 *pCp = 0;
270 return rc;
271}
272
273
274RTDECL(size_t) RTStrUniLen(const char *psz)
275{
276 size_t cCodePoints;
277 int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
278 return RT_SUCCESS(rc) ? cCodePoints : 0;
279}
280RT_EXPORT_SYMBOL(RTStrUniLen);
281
282
283RTDECL(int) RTStrUniLenEx(const char *psz, size_t cch, size_t *pcCps)
284{
285 size_t cCodePoints;
286 int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
287 if (pcCps)
288 *pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
289 return rc;
290}
291RT_EXPORT_SYMBOL(RTStrUniLenEx);
292
293
294RTDECL(int) RTStrValidateEncoding(const char *psz)
295{
296 return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
297}
298RT_EXPORT_SYMBOL(RTStrValidateEncoding);
299
300
301RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
302{
303 AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
304 AssertPtr(psz);
305
306 /*
307 * Use rtUtf8Length for the job.
308 */
309 size_t cchActual;
310 size_t cCpsIgnored;
311 int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
312 if (RT_SUCCESS(rc))
313 {
314 if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
315 && cchActual >= cch)
316 rc = VERR_BUFFER_OVERFLOW;
317 }
318 return rc;
319}
320RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
321
322
323RTDECL(bool) RTStrIsValidEncoding(const char *psz)
324{
325 int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
326 return RT_SUCCESS(rc);
327}
328RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
329
330
331RTDECL(size_t) RTStrPurgeEncoding(char *psz)
332{
333 size_t cErrors = 0;
334 for (;;)
335 {
336 RTUNICP Cp;
337 int rc = RTStrGetCpEx((const char **)&psz, &Cp);
338 if (RT_SUCCESS(rc))
339 {
340 if (!Cp)
341 break;
342 }
343 else
344 {
345 psz[-1] = '?';
346 cErrors++;
347 }
348 }
349 return cErrors;
350}
351RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
352
353
354RTDECL(int) RTStrToUni(const char *pszString, PRTUNICP *ppaCps)
355{
356 /*
357 * Validate input.
358 */
359 Assert(VALID_PTR(pszString));
360 Assert(VALID_PTR(ppaCps));
361 *ppaCps = NULL;
362
363 /*
364 * Validate the UTF-8 input and count its code points.
365 */
366 size_t cCps;
367 int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
368 if (RT_SUCCESS(rc))
369 {
370 /*
371 * Allocate buffer.
372 */
373 PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
374 if (paCps)
375 {
376 /*
377 * Decode the string.
378 */
379 rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
380 if (RT_SUCCESS(rc))
381 {
382 *ppaCps = paCps;
383 return rc;
384 }
385 RTMemFree(paCps);
386 }
387 else
388 rc = VERR_NO_CODE_POINT_MEMORY;
389 }
390 return rc;
391}
392RT_EXPORT_SYMBOL(RTStrToUni);
393
394
395RTDECL(int) RTStrToUniEx(const char *pszString, size_t cchString, PRTUNICP *ppaCps, size_t cCps, size_t *pcCps)
396{
397 /*
398 * Validate input.
399 */
400 Assert(VALID_PTR(pszString));
401 Assert(VALID_PTR(ppaCps));
402 Assert(!pcCps || VALID_PTR(pcCps));
403
404 /*
405 * Validate the UTF-8 input and count the code points.
406 */
407 size_t cCpsResult;
408 int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
409 if (RT_SUCCESS(rc))
410 {
411 if (pcCps)
412 *pcCps = cCpsResult;
413
414 /*
415 * Check buffer size / Allocate buffer.
416 */
417 bool fShouldFree;
418 PRTUNICP paCpsResult;
419 if (cCps > 0 && *ppaCps)
420 {
421 fShouldFree = false;
422 if (cCps <= cCpsResult)
423 return VERR_BUFFER_OVERFLOW;
424 paCpsResult = *ppaCps;
425 }
426 else
427 {
428 *ppaCps = NULL;
429 fShouldFree = true;
430 cCps = RT_MAX(cCpsResult + 1, cCps);
431 paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
432 }
433 if (paCpsResult)
434 {
435 /*
436 * Encode the UTF-16 string.
437 */
438 rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
439 if (RT_SUCCESS(rc))
440 {
441 *ppaCps = paCpsResult;
442 return rc;
443 }
444 if (fShouldFree)
445 RTMemFree(paCpsResult);
446 }
447 else
448 rc = VERR_NO_CODE_POINT_MEMORY;
449 }
450 return rc;
451}
452RT_EXPORT_SYMBOL(RTStrToUniEx);
453
454
455/**
456 * Calculates the UTF-16 length of a string, validating the encoding while doing so.
457 *
458 * @returns IPRT status code.
459 * @param psz Pointer to the UTF-8 string.
460 * @param cch The max length of the string. (btw cch = cb)
461 * Use RTSTR_MAX if all of the string is to be examined.
462 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
463 */
464static int rtUtf8CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
465{
466 const unsigned char *puch = (const unsigned char *)psz;
467 size_t cwc = 0;
468 while (cch > 0)
469 {
470 const unsigned char uch = *puch;
471 if (!uch)
472 break;
473 if (!(uch & RT_BIT(7)))
474 {
475 /* one ASCII byte */
476 cwc++;
477 puch++;
478 cch--;
479 }
480 else
481 {
482 /* figure sequence length and validate the first byte */
483 unsigned cb;
484 if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
485 cb = 2;
486 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
487 cb = 3;
488 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)))
489 cb = 4;
490 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3)))
491 cb = 5;
492 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2) | RT_BIT(1))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3) | RT_BIT(2)))
493 cb = 6;
494 else
495 {
496 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
497 return VERR_INVALID_UTF8_ENCODING;
498 }
499
500 /* check length */
501 if (cb > cch)
502 {
503 RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
504 return VERR_INVALID_UTF8_ENCODING;
505 }
506
507 /* validate the rest */
508 switch (cb)
509 {
510 case 6:
511 RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
512 case 5:
513 RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
514 case 4:
515 RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
516 case 3:
517 RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
518 case 2:
519 RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) | RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
520 break;
521 }
522
523 /* validate the code point. */
524 RTUNICP uc;
525 switch (cb)
526 {
527 case 6:
528 uc = (puch[5] & 0x3f)
529 | ((RTUNICP)(puch[4] & 0x3f) << 6)
530 | ((RTUNICP)(puch[3] & 0x3f) << 12)
531 | ((RTUNICP)(puch[2] & 0x3f) << 18)
532 | ((RTUNICP)(puch[1] & 0x3f) << 24)
533 | ((RTUNICP)(uch & 0x01) << 30);
534 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
535 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
536 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
537 return VERR_CANT_RECODE_AS_UTF16;
538 case 5:
539 uc = (puch[4] & 0x3f)
540 | ((RTUNICP)(puch[3] & 0x3f) << 6)
541 | ((RTUNICP)(puch[2] & 0x3f) << 12)
542 | ((RTUNICP)(puch[1] & 0x3f) << 18)
543 | ((RTUNICP)(uch & 0x03) << 24);
544 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
545 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
546 RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
547 return VERR_CANT_RECODE_AS_UTF16;
548 case 4:
549 uc = (puch[3] & 0x3f)
550 | ((RTUNICP)(puch[2] & 0x3f) << 6)
551 | ((RTUNICP)(puch[1] & 0x3f) << 12)
552 | ((RTUNICP)(uch & 0x07) << 18);
553 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
554 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
555 RTStrAssertMsgReturn(uc <= 0x0010ffff,
556 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
557 cwc++;
558 break;
559 case 3:
560 uc = (puch[2] & 0x3f)
561 | ((RTUNICP)(puch[1] & 0x3f) << 6)
562 | ((RTUNICP)(uch & 0x0f) << 12);
563 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
564 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
565 uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
566 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
567 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
568 break;
569 case 2:
570 uc = (puch[1] & 0x3f)
571 | ((RTUNICP)(uch & 0x1f) << 6);
572 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
573 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
574 break;
575 }
576
577 /* advance */
578 cch -= cb;
579 puch += cb;
580 cwc++;
581 }
582 }
583
584 /* done */
585 *pcwc = cwc;
586 return VINF_SUCCESS;
587}
588
589
590/**
591 * Recodes a valid UTF-8 string as UTF-16.
592 *
593 * Since we know the input is valid, we do *not* perform encoding or length checks.
594 *
595 * @returns iprt status code.
596 * @param psz The UTF-8 string to recode. This is a valid encoding.
597 * @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
598 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
599 * @param pwsz Where to store the UTF-16 string.
600 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
601 */
602static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
603{
604 int rc = VINF_SUCCESS;
605 const unsigned char *puch = (const unsigned char *)psz;
606 PRTUTF16 pwc = pwsz;
607 while (cch > 0)
608 {
609 /* read the next char and check for terminator. */
610 const unsigned char uch = *puch;
611 if (!uch)
612 break;
613
614 /* check for output overflow */
615 if (RT_UNLIKELY(cwc < 1))
616 {
617 rc = VERR_BUFFER_OVERFLOW;
618 break;
619 }
620 cwc--;
621
622 /* decode and recode the code point */
623 if (!(uch & RT_BIT(7)))
624 {
625 *pwc++ = uch;
626 puch++;
627 cch--;
628 }
629 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
630 {
631 uint16_t uc = (puch[1] & 0x3f)
632 | ((uint16_t)(uch & 0x1f) << 6);
633 *pwc++ = uc;
634 puch += 2;
635 cch -= 2;
636 }
637 else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5)))
638 {
639 uint16_t uc = (puch[2] & 0x3f)
640 | ((uint16_t)(puch[1] & 0x3f) << 6)
641 | ((uint16_t)(uch & 0x0f) << 12);
642 *pwc++ = uc;
643 puch += 3;
644 cch -= 3;
645 }
646 else
647 {
648 /* generate surrugate pair */
649 Assert((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4) | RT_BIT(3))) == (RT_BIT(7) | RT_BIT(6) | RT_BIT(5) | RT_BIT(4)));
650 RTUNICP uc = (puch[3] & 0x3f)
651 | ((RTUNICP)(puch[2] & 0x3f) << 6)
652 | ((RTUNICP)(puch[1] & 0x3f) << 12)
653 | ((RTUNICP)(uch & 0x07) << 18);
654 if (RT_UNLIKELY(cwc < 1))
655 {
656 rc = VERR_BUFFER_OVERFLOW;
657 break;
658 }
659 cwc--;
660
661 uc -= 0x10000;
662 *pwc++ = 0xd800 | (uc >> 10);
663 *pwc++ = 0xdc00 | (uc & 0x3ff);
664 puch += 4;
665 cch -= 4;
666 }
667 }
668
669 /* done */
670 *pwc = '\0';
671 return rc;
672}
673
674
675RTDECL(int) RTStrToUtf16Tag(const char *pszString, PRTUTF16 *ppwszString, const char *pszTag)
676{
677 /*
678 * Validate input.
679 */
680 Assert(VALID_PTR(ppwszString));
681 Assert(VALID_PTR(pszString));
682 *ppwszString = NULL;
683
684 /*
685 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
686 */
687 size_t cwc;
688 int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
689 if (RT_SUCCESS(rc))
690 {
691 /*
692 * Allocate buffer.
693 */
694 PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
695 if (pwsz)
696 {
697 /*
698 * Encode the UTF-16 string.
699 */
700 rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
701 if (RT_SUCCESS(rc))
702 {
703 *ppwszString = pwsz;
704 return rc;
705 }
706 RTMemFree(pwsz);
707 }
708 else
709 rc = VERR_NO_UTF16_MEMORY;
710 }
711 return rc;
712}
713RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
714
715
716RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
717 PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc, const char *pszTag)
718{
719 /*
720 * Validate input.
721 */
722 Assert(VALID_PTR(pszString));
723 Assert(VALID_PTR(ppwsz));
724 Assert(!pcwc || VALID_PTR(pcwc));
725
726 /*
727 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
728 */
729 size_t cwcResult;
730 int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
731 if (RT_SUCCESS(rc))
732 {
733 if (pcwc)
734 *pcwc = cwcResult;
735
736 /*
737 * Check buffer size / Allocate buffer.
738 */
739 bool fShouldFree;
740 PRTUTF16 pwszResult;
741 if (cwc > 0 && *ppwsz)
742 {
743 fShouldFree = false;
744 if (cwc <= cwcResult)
745 return VERR_BUFFER_OVERFLOW;
746 pwszResult = *ppwsz;
747 }
748 else
749 {
750 *ppwsz = NULL;
751 fShouldFree = true;
752 cwc = RT_MAX(cwcResult + 1, cwc);
753 pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
754 }
755 if (pwszResult)
756 {
757 /*
758 * Encode the UTF-16 string.
759 */
760 rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
761 if (RT_SUCCESS(rc))
762 {
763 *ppwsz = pwszResult;
764 return rc;
765 }
766 if (fShouldFree)
767 RTMemFree(pwszResult);
768 }
769 else
770 rc = VERR_NO_UTF16_MEMORY;
771 }
772 return rc;
773}
774RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
775
776
777RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
778{
779 size_t cwc;
780 int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
781 return RT_SUCCESS(rc) ? cwc : 0;
782}
783RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
784
785
786RTDECL(int) RTStrCalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
787{
788 size_t cwc;
789 int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
790 if (pcwc)
791 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
792 return rc;
793}
794RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
795
796
797/**
798 * Calculates the length of the UTF-8 encoding of a Latin-1 string.
799 *
800 * @returns iprt status code.
801 * @param psz The Latin-1 string.
802 * @param cchIn The max length of the Latin-1 string to consider.
803 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
804 */
805static int rtLatin1CalcUtf8Length(const char *psz, size_t cchIn, size_t *pcch)
806{
807 size_t cch = 0;
808 for (;;)
809 {
810 RTUNICP Cp;
811 size_t cchCp;
812 int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
813 if (Cp == 0 || rc == VERR_END_OF_STRING)
814 break;
815 if (RT_FAILURE(rc))
816 return rc;
817 cch += RTStrCpSize(Cp); /* cannot fail */
818 }
819
820 /* done */
821 *pcch = cch;
822 return VINF_SUCCESS;
823}
824
825
826/**
827 * Recodes a Latin-1 string as UTF-8.
828 *
829 * @returns iprt status code.
830 * @param psz The Latin-1 string.
831 * @param cchIn The number of characters to process from psz. The recoding
832 * will stop when cch or '\\0' is reached.
833 * @param psz Where to store the UTF-8 string.
834 * @param cch The size of the UTF-8 buffer, excluding the terminator.
835 */
836static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
837{
838 int rc = VINF_SUCCESS;
839 for (;;)
840 {
841 RTUNICP Cp;
842 size_t cchCp;
843 rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
844 if (Cp == 0 || RT_FAILURE(rc))
845 break;
846 cchCp = RTStrCpSize(Cp);
847 if (RT_UNLIKELY(cch < cchCp))
848 {
849 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
850 rc = VERR_BUFFER_OVERFLOW;
851 break;
852 }
853 cch -= cchCp;
854 psz = RTStrPutCp(psz, Cp);
855 }
856
857 /* done */
858 if (rc == VERR_END_OF_STRING)
859 rc = VINF_SUCCESS;
860 *psz = '\0';
861 return rc;
862}
863
864
865
866RTDECL(int) RTLatin1ToUtf8Tag(const char *pszString, char **ppszString, const char *pszTag)
867{
868 /*
869 * Validate input.
870 */
871 Assert(VALID_PTR(ppszString));
872 Assert(VALID_PTR(pszString));
873 *ppszString = NULL;
874
875 /*
876 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
877 */
878 size_t cch;
879 int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
880 if (RT_SUCCESS(rc))
881 {
882 /*
883 * Allocate buffer and recode it.
884 */
885 char *pszResult = (char *)RTMemAllocTag(cch + 1, pszTag);
886 if (pszResult)
887 {
888 rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
889 if (RT_SUCCESS(rc))
890 {
891 *ppszString = pszResult;
892 return rc;
893 }
894
895 RTMemFree(pszResult);
896 }
897 else
898 rc = VERR_NO_STR_MEMORY;
899 }
900 return rc;
901}
902RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
903
904
905RTDECL(int) RTLatin1ToUtf8ExTag(const char *pszString, size_t cchString, char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
906{
907 /*
908 * Validate input.
909 */
910 Assert(VALID_PTR(pszString));
911 Assert(VALID_PTR(ppsz));
912 Assert(!pcch || VALID_PTR(pcch));
913
914 /*
915 * Calculate the length of the UTF-8 encoding of the Latin-1 string.
916 */
917 size_t cchResult;
918 int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
919 if (RT_SUCCESS(rc))
920 {
921 if (pcch)
922 *pcch = cchResult;
923
924 /*
925 * Check buffer size / Allocate buffer and recode it.
926 */
927 bool fShouldFree;
928 char *pszResult;
929 if (cch > 0 && *ppsz)
930 {
931 fShouldFree = false;
932 if (RT_UNLIKELY(cch <= cchResult))
933 return VERR_BUFFER_OVERFLOW;
934 pszResult = *ppsz;
935 }
936 else
937 {
938 *ppsz = NULL;
939 fShouldFree = true;
940 cch = RT_MAX(cch, cchResult + 1);
941 pszResult = (char *)RTStrAllocTag(cch, pszTag);
942 }
943 if (pszResult)
944 {
945 rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
946 if (RT_SUCCESS(rc))
947 {
948 *ppsz = pszResult;
949 return rc;
950 }
951
952 if (fShouldFree)
953 RTStrFree(pszResult);
954 }
955 else
956 rc = VERR_NO_STR_MEMORY;
957 }
958 return rc;
959}
960RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
961
962
963RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
964{
965 size_t cch;
966 int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
967 return RT_SUCCESS(rc) ? cch : 0;
968}
969RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
970
971
972RTDECL(int) RTLatin1CalcUtf8LenEx(const char *psz, size_t cchIn, size_t *pcch)
973{
974 size_t cch;
975 int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
976 if (pcch)
977 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
978 return rc;
979}
980RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
981
982
983/**
984 * Calculates the Latin-1 length of a string, validating the encoding while
985 * doing so.
986 *
987 * @returns IPRT status code.
988 * @param psz Pointer to the UTF-8 string.
989 * @param cchIn The max length of the string. (btw cch = cb)
990 * Use RTSTR_MAX if all of the string is to be examined.
991 * @param pcch Where to store the length of the Latin-1 string in bytes.
992 */
993static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
994{
995 size_t cch = 0;
996 for (;;)
997 {
998 RTUNICP Cp;
999 size_t cchCp;
1000 int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1001 if (Cp == 0 || rc == VERR_END_OF_STRING)
1002 break;
1003 if (RT_FAILURE(rc))
1004 return rc;
1005 cchCp = RTLatin1CpSize(Cp);
1006 if (cchCp == 0)
1007 return VERR_NO_TRANSLATION;
1008 cch += cchCp;
1009 }
1010
1011 /* done */
1012 *pcch = cch;
1013 return VINF_SUCCESS;
1014}
1015
1016
1017/**
1018 * Recodes a valid UTF-8 string as Latin-1.
1019 *
1020 * Since we know the input is valid, we do *not* perform encoding or length checks.
1021 *
1022 * @returns iprt status code.
1023 * @param pszIn The UTF-8 string to recode. This is a valid encoding.
1024 * @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1025 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1026 * @param psz Where to store the Latin-1 string.
1027 * @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1028 */
1029static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
1030{
1031 int rc = VINF_SUCCESS;
1032
1033 for (;;)
1034 {
1035 RTUNICP Cp;
1036 size_t cchCp;
1037 rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1038 if (Cp == 0 || RT_FAILURE(rc))
1039 break;
1040 cchCp = RTLatin1CpSize(Cp);
1041 if (RT_UNLIKELY(cch < cchCp))
1042 {
1043 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1044 rc = VERR_BUFFER_OVERFLOW;
1045 break;
1046 }
1047 cch -= cchCp;
1048 psz = RTLatin1PutCp(psz, Cp);
1049 }
1050
1051 /* done */
1052 if (rc == VERR_END_OF_STRING)
1053 rc = VINF_SUCCESS;
1054 *psz = '\0';
1055 return rc;
1056}
1057
1058
1059
1060RTDECL(int) RTStrToLatin1Tag(const char *pszString, char **ppszString, const char *pszTag)
1061{
1062 /*
1063 * Validate input.
1064 */
1065 Assert(VALID_PTR(ppszString));
1066 Assert(VALID_PTR(pszString));
1067 *ppszString = NULL;
1068
1069 /*
1070 * Validate the UTF-8 input and calculate the length of the Latin-1 string.
1071 */
1072 size_t cch;
1073 int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1074 if (RT_SUCCESS(rc))
1075 {
1076 /*
1077 * Allocate buffer.
1078 */
1079 char *psz = (char *)RTMemAllocTag(cch + 1, pszTag);
1080 if (psz)
1081 {
1082 /*
1083 * Encode the UTF-16 string.
1084 */
1085 rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1086 if (RT_SUCCESS(rc))
1087 {
1088 *ppszString = psz;
1089 return rc;
1090 }
1091 RTMemFree(psz);
1092 }
1093 else
1094 rc = VERR_NO_STR_MEMORY;
1095 }
1096 return rc;
1097}
1098RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1099
1100
1101RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1102 char **ppsz, size_t cch, size_t *pcch, const char *pszTag)
1103{
1104 /*
1105 * Validate input.
1106 */
1107 Assert(VALID_PTR(pszString));
1108 Assert(VALID_PTR(ppsz));
1109 Assert(!pcch || VALID_PTR(pcch));
1110
1111 /*
1112 * Validate the UTF-8 input and calculate the length of the UTF-16 string.
1113 */
1114 size_t cchResult;
1115 int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1116 if (RT_SUCCESS(rc))
1117 {
1118 if (pcch)
1119 *pcch = cchResult;
1120
1121 /*
1122 * Check buffer size / Allocate buffer.
1123 */
1124 bool fShouldFree;
1125 char *pszResult;
1126 if (cch > 0 && *ppsz)
1127 {
1128 fShouldFree = false;
1129 if (cch <= cchResult)
1130 return VERR_BUFFER_OVERFLOW;
1131 pszResult = *ppsz;
1132 }
1133 else
1134 {
1135 *ppsz = NULL;
1136 fShouldFree = true;
1137 cch = RT_MAX(cchResult + 1, cch);
1138 pszResult = (char *)RTMemAllocTag(cch, pszTag);
1139 }
1140 if (pszResult)
1141 {
1142 /*
1143 * Encode the Latin-1 string.
1144 */
1145 rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1146 if (RT_SUCCESS(rc))
1147 {
1148 *ppsz = pszResult;
1149 return rc;
1150 }
1151 if (fShouldFree)
1152 RTMemFree(pszResult);
1153 }
1154 else
1155 rc = VERR_NO_STR_MEMORY;
1156 }
1157 return rc;
1158}
1159RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1160
1161
1162RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1163{
1164 size_t cch;
1165 int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1166 return RT_SUCCESS(rc) ? cch : 0;
1167}
1168RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1169
1170
1171RTDECL(int) RTStrCalcLatin1LenEx(const char *psz, size_t cchIn, size_t *pcch)
1172{
1173 size_t cch;
1174 int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1175 if (pcch)
1176 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1177 return rc;
1178}
1179RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1180
1181
1182/**
1183 * Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1184 * @returns rc
1185 * @param ppsz The pointer to the string position point.
1186 * @param pCp Where to store RTUNICP_INVALID.
1187 * @param rc The iprt error code.
1188 */
1189static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1190{
1191 /*
1192 * Try find a valid encoding.
1193 */
1194 (*ppsz)++; /** @todo code this! */
1195 *pCp = RTUNICP_INVALID;
1196 return rc;
1197}
1198
1199
1200RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1201{
1202 RTUNICP Cp;
1203 RTStrGetCpExInternal(&psz, &Cp);
1204 return Cp;
1205}
1206RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1207
1208
1209RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1210{
1211 const unsigned char *puch = (const unsigned char *)*ppsz;
1212 const unsigned char uch = *puch;
1213 RTUNICP uc;
1214
1215 /* ASCII ? */
1216 if (!(uch & RT_BIT(7)))
1217 {
1218 uc = uch;
1219 puch++;
1220 }
1221 else if (uch & RT_BIT(6))
1222 {
1223 /* figure the length and validate the first octet. */
1224/** @todo RT_USE_RTC_3629 */
1225 unsigned cb;
1226 if (!(uch & RT_BIT(5)))
1227 cb = 2;
1228 else if (!(uch & RT_BIT(4)))
1229 cb = 3;
1230 else if (!(uch & RT_BIT(3)))
1231 cb = 4;
1232 else if (!(uch & RT_BIT(2)))
1233 cb = 5;
1234 else if (!(uch & RT_BIT(1)))
1235 cb = 6;
1236 else
1237 {
1238 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1239 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1240 }
1241
1242 /* validate the rest */
1243 switch (cb)
1244 {
1245 case 6:
1246 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1247 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1248 case 5:
1249 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1250 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1251 case 4:
1252 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1253 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1254 case 3:
1255 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1256 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1257 case 2:
1258 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1259 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1260 break;
1261 }
1262
1263 /* get and validate the code point. */
1264 switch (cb)
1265 {
1266 case 6:
1267 uc = (puch[5] & 0x3f)
1268 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1269 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1270 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1271 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1272 | ((RTUNICP)(uch & 0x01) << 30);
1273 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1274 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1275 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1276 break;
1277 case 5:
1278 uc = (puch[4] & 0x3f)
1279 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1280 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1281 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1282 | ((RTUNICP)(uch & 0x03) << 24);
1283 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1284 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1285 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1286 break;
1287 case 4:
1288 uc = (puch[3] & 0x3f)
1289 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1290 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1291 | ((RTUNICP)(uch & 0x07) << 18);
1292 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1293 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1294 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1295 break;
1296 case 3:
1297 uc = (puch[2] & 0x3f)
1298 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1299 | ((RTUNICP)(uch & 0x0f) << 12);
1300 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1301 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1302 rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1303 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1304 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1305 rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1306 break;
1307 case 2:
1308 uc = (puch[1] & 0x3f)
1309 | ((RTUNICP)(uch & 0x1f) << 6);
1310 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1311 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1312 rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1313 break;
1314 default: /* impossible, but GCC is bitching. */
1315 uc = RTUNICP_INVALID;
1316 break;
1317 }
1318 puch += cb;
1319 }
1320 else
1321 {
1322 /* 6th bit is always set. */
1323 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1324 return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1325 }
1326 *pCp = uc;
1327 *ppsz = (const char *)puch;
1328 return VINF_SUCCESS;
1329}
1330RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1331
1332
1333/**
1334 * Handle invalid encodings passed to RTStrGetCpNEx().
1335 * @returns rc
1336 * @param ppsz The pointer to the string position point.
1337 * @param pcch Pointer to the string length.
1338 * @param pCp Where to store RTUNICP_INVALID.
1339 * @param rc The iprt error code.
1340 */
1341static int rtStrGetCpNExFailure(const char **ppsz, size_t *pcch, PRTUNICP pCp, int rc)
1342{
1343 /*
1344 * Try find a valid encoding.
1345 */
1346 (*ppsz)++; /** @todo code this! */
1347 (*pcch)--;
1348 *pCp = RTUNICP_INVALID;
1349 return rc;
1350}
1351
1352
1353RTDECL(int) RTStrGetCpNExInternal(const char **ppsz, size_t *pcch, PRTUNICP pCp)
1354{
1355 const unsigned char *puch = (const unsigned char *)*ppsz;
1356 const unsigned char uch = *puch;
1357 size_t cch = *pcch;
1358 RTUNICP uc;
1359
1360 if (cch == 0)
1361 {
1362 *pCp = RTUNICP_INVALID;
1363 return VERR_END_OF_STRING;
1364 }
1365
1366 /* ASCII ? */
1367 if (!(uch & RT_BIT(7)))
1368 {
1369 uc = uch;
1370 puch++;
1371 cch--;
1372 }
1373 else if (uch & RT_BIT(6))
1374 {
1375 /* figure the length and validate the first octet. */
1376/** @todo RT_USE_RTC_3629 */
1377 unsigned cb;
1378 if (!(uch & RT_BIT(5)))
1379 cb = 2;
1380 else if (!(uch & RT_BIT(4)))
1381 cb = 3;
1382 else if (!(uch & RT_BIT(3)))
1383 cb = 4;
1384 else if (!(uch & RT_BIT(2)))
1385 cb = 5;
1386 else if (!(uch & RT_BIT(1)))
1387 cb = 6;
1388 else
1389 {
1390 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1391 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1392 }
1393
1394 if (cb > cch)
1395 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1396
1397 /* validate the rest */
1398 switch (cb)
1399 {
1400 case 6:
1401 RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1402 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1403 case 5:
1404 RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1405 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1406 case 4:
1407 RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1408 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1409 case 3:
1410 RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1411 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1412 case 2:
1413 RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1414 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1415 break;
1416 }
1417
1418 /* get and validate the code point. */
1419 switch (cb)
1420 {
1421 case 6:
1422 uc = (puch[5] & 0x3f)
1423 | ((RTUNICP)(puch[4] & 0x3f) << 6)
1424 | ((RTUNICP)(puch[3] & 0x3f) << 12)
1425 | ((RTUNICP)(puch[2] & 0x3f) << 18)
1426 | ((RTUNICP)(puch[1] & 0x3f) << 24)
1427 | ((RTUNICP)(uch & 0x01) << 30);
1428 RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1429 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1430 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1431 break;
1432 case 5:
1433 uc = (puch[4] & 0x3f)
1434 | ((RTUNICP)(puch[3] & 0x3f) << 6)
1435 | ((RTUNICP)(puch[2] & 0x3f) << 12)
1436 | ((RTUNICP)(puch[1] & 0x3f) << 18)
1437 | ((RTUNICP)(uch & 0x03) << 24);
1438 RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1439 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1440 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1441 break;
1442 case 4:
1443 uc = (puch[3] & 0x3f)
1444 | ((RTUNICP)(puch[2] & 0x3f) << 6)
1445 | ((RTUNICP)(puch[1] & 0x3f) << 12)
1446 | ((RTUNICP)(uch & 0x07) << 18);
1447 RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1448 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1449 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1450 break;
1451 case 3:
1452 uc = (puch[2] & 0x3f)
1453 | ((RTUNICP)(puch[1] & 0x3f) << 6)
1454 | ((RTUNICP)(uch & 0x0f) << 12);
1455 RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1456 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1457 rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff || uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1458 RTStrAssertMsgReturn(uc < 0xd800 || uc > 0xdfff,
1459 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1460 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1461 break;
1462 case 2:
1463 uc = (puch[1] & 0x3f)
1464 | ((RTUNICP)(uch & 0x1f) << 6);
1465 RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1466 ("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char *)puch)), puch),
1467 rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1468 break;
1469 default: /* impossible, but GCC is bitching. */
1470 uc = RTUNICP_INVALID;
1471 break;
1472 }
1473 puch += cb;
1474 cch -= cb;
1475 }
1476 else
1477 {
1478 /* 6th bit is always set. */
1479 RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(strlen((char *)puch), 10), puch));
1480 return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1481 }
1482 *pCp = uc;
1483 *ppsz = (const char *)puch;
1484 (*pcch) = cch;
1485 return VINF_SUCCESS;
1486}
1487RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1488
1489
1490RTDECL(char *) RTStrPutCpInternal(char *psz, RTUNICP uc)
1491{
1492 unsigned char *puch = (unsigned char *)psz;
1493 if (uc < 0x80)
1494 *puch++ = (unsigned char )uc;
1495 else if (uc < 0x00000800)
1496 {
1497 *puch++ = 0xc0 | (uc >> 6);
1498 *puch++ = 0x80 | (uc & 0x3f);
1499 }
1500 else if (uc < 0x00010000)
1501 {
1502/** @todo RT_USE_RTC_3629 */
1503 if ( uc < 0x0000d8000
1504 || ( uc > 0x0000dfff
1505 && uc < 0x0000fffe))
1506 {
1507 *puch++ = 0xe0 | (uc >> 12);
1508 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1509 *puch++ = 0x80 | (uc & 0x3f);
1510 }
1511 else
1512 {
1513 AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1514 *puch++ = 0x7f;
1515 }
1516 }
1517/** @todo RT_USE_RTC_3629 */
1518 else if (uc < 0x00200000)
1519 {
1520 *puch++ = 0xf0 | (uc >> 18);
1521 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1522 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1523 *puch++ = 0x80 | (uc & 0x3f);
1524 }
1525 else if (uc < 0x04000000)
1526 {
1527 *puch++ = 0xf8 | (uc >> 24);
1528 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1529 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1530 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1531 *puch++ = 0x80 | (uc & 0x3f);
1532 }
1533 else if (uc <= 0x7fffffff)
1534 {
1535 *puch++ = 0xfc | (uc >> 30);
1536 *puch++ = 0x80 | ((uc >> 24) & 0x3f);
1537 *puch++ = 0x80 | ((uc >> 18) & 0x3f);
1538 *puch++ = 0x80 | ((uc >> 12) & 0x3f);
1539 *puch++ = 0x80 | ((uc >> 6) & 0x3f);
1540 *puch++ = 0x80 | (uc & 0x3f);
1541 }
1542 else
1543 {
1544 AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1545 *puch++ = 0x7f;
1546 }
1547
1548 return (char *)puch;
1549}
1550RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1551
1552
1553RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz)
1554{
1555 if (pszStart < psz)
1556 {
1557 /* simple char? */
1558 const unsigned char *puch = (const unsigned char *)psz;
1559 unsigned uch = *--puch;
1560 if (!(uch & RT_BIT(7)))
1561 return (char *)puch;
1562 RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1563
1564 /* two or more. */
1565 uint32_t uMask = 0xffffffc0;
1566 while ( (const unsigned char *)pszStart < puch
1567 && !(uMask & 1))
1568 {
1569 uch = *--puch;
1570 if ((uch & 0xc0) != 0x80)
1571 {
1572 RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1573 ("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz),
1574 (char *)pszStart);
1575 return (char *)puch;
1576 }
1577 uMask >>= 1;
1578 }
1579 RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.*Rhxs puch=%p psz=%p\n", psz - (char *)puch, puch, psz));
1580 }
1581 return (char *)pszStart;
1582}
1583RT_EXPORT_SYMBOL(RTStrPrevCp);
1584
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette