VirtualBox

source: vbox/trunk/src/VBox/Runtime/common/string/utf-16.cpp@ 21714

Last change on this file since 21714 was 21714, checked in by vboxsync, 16 years ago

iprt/string: conversion between Utf16 and Latin1

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 30.3 KB
Line 
1/* $Id: utf-16.cpp 21714 2009-07-17 23:22:40Z vboxsync $ */
2/** @file
3 * IPRT - UTF-16.
4 */
5
6/*
7 * Copyright (C) 2006-2007 Sun Microsystems, Inc.
8 *
9 * This file is part of VirtualBox Open Source Edition (OSE), as
10 * available from http://www.virtualbox.org. This file is free software;
11 * you can redistribute it and/or modify it under the terms of the GNU
12 * General Public License (GPL) as published by the Free Software
13 * Foundation, in version 2 as it comes in the "COPYING" file of the
14 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16 *
17 * The contents of this file may alternatively be used under the terms
18 * of the Common Development and Distribution License Version 1.0
19 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20 * VirtualBox OSE distribution, in which case the provisions of the
21 * CDDL are applicable instead of those of the GPL.
22 *
23 * You may elect to license modified versions of this file under the
24 * terms and conditions of either the GPL or the CDDL or both.
25 *
26 * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa
27 * Clara, CA 95054 USA or visit http://www.sun.com if you need
28 * additional information or have any questions.
29 */
30
31
32/*******************************************************************************
33* Header Files *
34*******************************************************************************/
35#include <iprt/string.h>
36#include "internal/iprt.h"
37
38#include <iprt/uni.h>
39#include <iprt/alloc.h>
40#include <iprt/assert.h>
41#include <iprt/err.h>
42#include "internal/string.h"
43
44
45
46RTDECL(void) RTUtf16Free(PRTUTF16 pwszString)
47{
48 if (pwszString)
49 RTMemTmpFree(pwszString);
50}
51RT_EXPORT_SYMBOL(RTUtf16Free);
52
53
54RTDECL(PRTUTF16) RTUtf16Dup(PCRTUTF16 pwszString)
55{
56 Assert(pwszString);
57 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
58 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb);
59 if (pwsz)
60 memcpy(pwsz, pwszString, cb);
61 return pwsz;
62}
63RT_EXPORT_SYMBOL(RTUtf16Dup);
64
65
66RTDECL(int) RTUtf16DupEx(PRTUTF16 *ppwszString, PCRTUTF16 pwszString, size_t cwcExtra)
67{
68 Assert(pwszString);
69 size_t cb = (RTUtf16Len(pwszString) + 1) * sizeof(RTUTF16);
70 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc(cb + cwcExtra * sizeof(RTUTF16));
71 if (pwsz)
72 {
73 memcpy(pwsz, pwszString, cb);
74 *ppwszString = pwsz;
75 return VINF_SUCCESS;
76 }
77 return VERR_NO_MEMORY;
78}
79RT_EXPORT_SYMBOL(RTUtf16DupEx);
80
81
82RTDECL(size_t) RTUtf16Len(PCRTUTF16 pwszString)
83{
84 if (!pwszString)
85 return 0;
86
87 PCRTUTF16 pwsz = pwszString;
88 while (*pwsz)
89 pwsz++;
90 return pwsz - pwszString;
91}
92RT_EXPORT_SYMBOL(RTUtf16Len);
93
94
95RTDECL(int) RTUtf16Cmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
96{
97 if (pwsz1 == pwsz2)
98 return 0;
99 if (!pwsz1)
100 return -1;
101 if (!pwsz2)
102 return 1;
103
104 for (;;)
105 {
106 register RTUTF16 wcs = *pwsz1;
107 register int iDiff = wcs - *pwsz2;
108 if (iDiff || !wcs)
109 return iDiff;
110 pwsz1++;
111 pwsz2++;
112 }
113}
114RT_EXPORT_SYMBOL(RTUtf16Cmp);
115
116
117RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
118{
119 if (pwsz1 == pwsz2)
120 return 0;
121 if (!pwsz1)
122 return -1;
123 if (!pwsz2)
124 return 1;
125
126 PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
127 for (;;)
128 {
129 register RTUTF16 wc1 = *pwsz1;
130 register RTUTF16 wc2 = *pwsz2;
131 register int iDiff = wc1 - wc2;
132 if (iDiff)
133 {
134 /* unless they are *both* surrogate pairs, there is no chance they'll be identical. */
135 if ( wc1 < 0xd800
136 || wc2 < 0xd800
137 || wc1 > 0xdfff
138 || wc2 > 0xdfff)
139 {
140 /* simple UCS-2 char */
141 iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
142 if (iDiff)
143 iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
144 }
145 else
146 {
147 /* a damned pair */
148 RTUNICP uc1;
149 RTUNICP uc2;
150 if (wc1 >= 0xdc00)
151 {
152 if (pwsz1Start == pwsz1)
153 return iDiff;
154 uc1 = pwsz1[-1];
155 if (uc1 < 0xd800 || uc1 >= 0xdc00)
156 return iDiff;
157 uc1 = 0x10000 + (((uc1 & 0x3ff) << 10) | (wc1 & 0x3ff));
158 uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
159 }
160 else
161 {
162 uc1 = *++pwsz1;
163 if (uc1 < 0xdc00 || uc1 >= 0xe000)
164 return iDiff;
165 uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1 & 0x3ff));
166 uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
167 }
168 iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
169 if (iDiff)
170 iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
171 }
172 if (iDiff)
173 return iDiff;
174 }
175 if (!wc1)
176 return 0;
177 pwsz1++;
178 pwsz2++;
179 }
180}
181RT_EXPORT_SYMBOL(RTUtf16ICmp);
182
183
184RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz)
185{
186 PRTUTF16 pwc = pwsz;
187 for (;;)
188 {
189 RTUTF16 wc = *pwc;
190 if (!wc)
191 break;
192 if (wc < 0xd800 || wc >= 0xdc00)
193 {
194 RTUNICP ucFolded = RTUniCpToLower(wc);
195 if (ucFolded < 0x10000)
196 *pwc++ = RTUniCpToLower(wc);
197 }
198 else
199 {
200 /* surrogate */
201 RTUTF16 wc2 = pwc[1];
202 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
203 {
204 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
205 RTUNICP ucFolded = RTUniCpToLower(uc);
206 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
207 {
208 uc -= 0x10000;
209 *pwc++ = 0xd800 | (uc >> 10);
210 *pwc++ = 0xdc00 | (uc & 0x3ff);
211 }
212 }
213 else /* invalid encoding. */
214 pwc++;
215 }
216 }
217 return pwsz;
218}
219RT_EXPORT_SYMBOL(RTUtf16ToLower);
220
221
222RTDECL(PRTUTF16) RTUtf16ToUpper(PRTUTF16 pwsz)
223{
224 PRTUTF16 pwc = pwsz;
225 for (;;)
226 {
227 RTUTF16 wc = *pwc;
228 if (!wc)
229 break;
230 if (wc < 0xd800 || wc >= 0xdc00)
231 *pwc++ = RTUniCpToUpper(wc);
232 else
233 {
234 /* surrogate */
235 RTUTF16 wc2 = pwc[1];
236 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
237 {
238 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
239 RTUNICP ucFolded = RTUniCpToUpper(uc);
240 if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */
241 {
242 uc -= 0x10000;
243 *pwc++ = 0xd800 | (uc >> 10);
244 *pwc++ = 0xdc00 | (uc & 0x3ff);
245 }
246 }
247 else /* invalid encoding. */
248 pwc++;
249 }
250 }
251 return pwsz;
252}
253RT_EXPORT_SYMBOL(RTUtf16ToUpper);
254
255
256/**
257 * Validate the UTF-16 encoding and calculates the length of an UTF-8 encoding.
258 *
259 * @returns iprt status code.
260 * @param pwsz The UTF-16 string.
261 * @param cwc The max length of the UTF-16 string to consider.
262 * @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
263 */
264static int rtUtf16CalcUtf8Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
265{
266 int rc = VINF_SUCCESS;
267 size_t cch = 0;
268 while (cwc > 0)
269 {
270 RTUTF16 wc = *pwsz++; cwc--;
271 if (!wc)
272 break;
273 else if (wc < 0xd800 || wc > 0xdfff)
274 {
275 if (wc < 0x80)
276 cch++;
277 else if (wc < 0x800)
278 cch += 2;
279 else if (wc < 0xfffe)
280 cch += 3;
281 else
282 {
283 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
284 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
285 break;
286 }
287 }
288 else
289 {
290 if (wc >= 0xdc00)
291 {
292 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
293 rc = VERR_INVALID_UTF16_ENCODING;
294 break;
295 }
296 if (cwc <= 0)
297 {
298 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
299 rc = VERR_INVALID_UTF16_ENCODING;
300 break;
301 }
302 wc = *pwsz++; cwc--;
303 if (wc < 0xdc00 || wc > 0xdfff)
304 {
305 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
306 rc = VERR_INVALID_UTF16_ENCODING;
307 break;
308 }
309 cch += 4;
310 }
311 }
312
313
314 /* done */
315 *pcch = cch;
316 return rc;
317}
318
319
320/**
321 * Recodes an valid UTF-16 string as UTF-8.
322 *
323 * @returns iprt status code.
324 * @param pwsz The UTF-16 string.
325 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
326 * will stop when cwc or '\\0' is reached.
327 * @param psz Where to store the UTF-8 string.
328 * @param cch The size of the UTF-8 buffer, excluding the terminator.
329 * @param pcch Where to store the number of octets actually encoded.
330 */
331static int rtUtf16RecodeAsUtf8(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
332{
333 unsigned char *pwch = (unsigned char *)psz;
334 int rc = VINF_SUCCESS;
335 while (cwc > 0)
336 {
337 RTUTF16 wc = *pwsz++; cwc--;
338 if (!wc)
339 break;
340 else if (wc < 0xd800 || wc > 0xdfff)
341 {
342 if (wc < 0x80)
343 {
344 if (cch < 1)
345 {
346 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
347 rc = VERR_BUFFER_OVERFLOW;
348 break;
349 }
350 cch--;
351 *pwch++ = (unsigned char)wc;
352 }
353 else if (wc < 0x800)
354 {
355 if (cch < 2)
356 {
357 RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
358 rc = VERR_BUFFER_OVERFLOW;
359 break;
360 }
361 cch -= 2;
362 *pwch++ = 0xc0 | (wc >> 6);
363 *pwch++ = 0x80 | (wc & 0x3f);
364 }
365 else if (wc < 0xfffe)
366 {
367 if (cch < 3)
368 {
369 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
370 rc = VERR_BUFFER_OVERFLOW;
371 break;
372 }
373 cch -= 3;
374 *pwch++ = 0xe0 | (wc >> 12);
375 *pwch++ = 0x80 | ((wc >> 6) & 0x3f);
376 *pwch++ = 0x80 | (wc & 0x3f);
377 }
378 else
379 {
380 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
381 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
382 break;
383 }
384 }
385 else
386 {
387 if (wc >= 0xdc00)
388 {
389 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
390 rc = VERR_INVALID_UTF16_ENCODING;
391 break;
392 }
393 if (cwc <= 0)
394 {
395 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
396 rc = VERR_INVALID_UTF16_ENCODING;
397 break;
398 }
399 RTUTF16 wc2 = *pwsz++; cwc--;
400 if (wc2 < 0xdc00 || wc2 > 0xdfff)
401 {
402 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
403 rc = VERR_INVALID_UTF16_ENCODING;
404 break;
405 }
406 uint32_t CodePoint = 0x10000
407 + ( ((wc & 0x3ff) << 10)
408 | (wc2 & 0x3ff));
409 if (cch < 4)
410 {
411 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
412 rc = VERR_BUFFER_OVERFLOW;
413 break;
414 }
415 cch -= 4;
416 *pwch++ = 0xf0 | (CodePoint >> 18);
417 *pwch++ = 0x80 | ((CodePoint >> 12) & 0x3f);
418 *pwch++ = 0x80 | ((CodePoint >> 6) & 0x3f);
419 *pwch++ = 0x80 | (CodePoint & 0x3f);
420 }
421 }
422
423 /* done */
424 *pwch = '\0';
425 *pcch = (char *)pwch - psz;
426 return rc;
427}
428
429
430
431RTDECL(int) RTUtf16ToUtf8(PCRTUTF16 pwszString, char **ppszString)
432{
433 /*
434 * Validate input.
435 */
436 Assert(VALID_PTR(ppszString));
437 Assert(VALID_PTR(pwszString));
438 *ppszString = NULL;
439
440 /*
441 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
442 */
443 size_t cch;
444 int rc = rtUtf16CalcUtf8Length(pwszString, RTSTR_MAX, &cch);
445 if (RT_SUCCESS(rc))
446 {
447 /*
448 * Allocate buffer and recode it.
449 */
450 char *pszResult = (char *)RTMemAlloc(cch + 1);
451 if (pszResult)
452 {
453 rc = rtUtf16RecodeAsUtf8(pwszString, RTSTR_MAX, pszResult, cch, &cch);
454 if (RT_SUCCESS(rc))
455 {
456 *ppszString = pszResult;
457 return rc;
458 }
459
460 RTMemFree(pszResult);
461 }
462 else
463 rc = VERR_NO_STR_MEMORY;
464 }
465 return rc;
466}
467RT_EXPORT_SYMBOL(RTUtf16ToUtf8);
468
469
470RTDECL(int) RTUtf16ToUtf8Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
471{
472 /*
473 * Validate input.
474 */
475 Assert(VALID_PTR(pwszString));
476 Assert(VALID_PTR(ppsz));
477 Assert(!pcch || VALID_PTR(pcch));
478
479 /*
480 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
481 */
482 size_t cchResult;
483 int rc = rtUtf16CalcUtf8Length(pwszString, cwcString, &cchResult);
484 if (RT_SUCCESS(rc))
485 {
486 if (pcch)
487 *pcch = cchResult;
488
489 /*
490 * Check buffer size / Allocate buffer and recode it.
491 */
492 bool fShouldFree;
493 char *pszResult;
494 if (cch > 0 && *ppsz)
495 {
496 fShouldFree = false;
497 if (cch <= cchResult)
498 return VERR_BUFFER_OVERFLOW;
499 pszResult = *ppsz;
500 }
501 else
502 {
503 *ppsz = NULL;
504 fShouldFree = true;
505 cch = RT_MAX(cch, cchResult + 1);
506 pszResult = (char *)RTMemAlloc(cch);
507 }
508 if (pszResult)
509 {
510 rc = rtUtf16RecodeAsUtf8(pwszString, cwcString, pszResult, cch - 1, &cch);
511 if (RT_SUCCESS(rc))
512 {
513 *ppsz = pszResult;
514 return rc;
515 }
516
517 if (fShouldFree)
518 RTMemFree(pszResult);
519 }
520 else
521 rc = VERR_NO_STR_MEMORY;
522 }
523 return rc;
524}
525RT_EXPORT_SYMBOL(RTUtf16ToUtf8Ex);
526
527
528RTDECL(size_t) RTUtf16CalcUtf8Len(PCRTUTF16 pwsz)
529{
530 size_t cch;
531 int rc = rtUtf16CalcUtf8Length(pwsz, RTSTR_MAX, &cch);
532 return RT_SUCCESS(rc) ? cch : 0;
533}
534RT_EXPORT_SYMBOL(RTUtf16CalcUtf8Len);
535
536
537RTDECL(int) RTUtf16CalcUtf8LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
538{
539 size_t cch;
540 int rc = rtUtf16CalcUtf8Length(pwsz, cwc, &cch);
541 if (pcch)
542 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
543 return rc;
544}
545RT_EXPORT_SYMBOL(RTUtf16CalcUtf8LenEx);
546
547
548RTDECL(RTUNICP) RTUtf16GetCpInternal(PCRTUTF16 pwsz)
549{
550 const RTUTF16 wc = *pwsz;
551
552 /* simple */
553 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
554 return wc;
555 if (wc < 0xfffe)
556 {
557 /* surrogate pair */
558 if (wc < 0xdc00)
559 {
560 const RTUTF16 wc2 = pwsz[1];
561 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
562 {
563 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
564 return uc;
565 }
566
567 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
568 }
569 else
570 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
571 }
572 else
573 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
574 return RTUNICP_INVALID;
575}
576RT_EXPORT_SYMBOL(RTUtf16GetCpInternal);
577
578
579RTDECL(int) RTUtf16GetCpExInternal(PCRTUTF16 *ppwsz, PRTUNICP pCp)
580{
581 const RTUTF16 wc = **ppwsz;
582
583 /* simple */
584 if (wc < 0xd800 || (wc > 0xdfff && wc < 0xfffe))
585 {
586 (*ppwsz)++;
587 *pCp = wc;
588 return VINF_SUCCESS;
589 }
590
591 int rc;
592 if (wc < 0xfffe)
593 {
594 /* surrogate pair */
595 if (wc < 0xdc00)
596 {
597 const RTUTF16 wc2 = (*ppwsz)[1];
598 if (wc2 >= 0xdc00 && wc2 <= 0xdfff)
599 {
600 RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff));
601 *pCp = uc;
602 (*ppwsz) += 2;
603 return VINF_SUCCESS;
604 }
605
606 RTStrAssertMsgFailed(("wc=%#08x wc2=%#08x - invalid 2nd char in surrogate pair\n", wc, wc2));
607 }
608 else
609 RTStrAssertMsgFailed(("wc=%#08x - invalid surrogate pair order\n", wc));
610 rc = VERR_INVALID_UTF16_ENCODING;
611 }
612 else
613 {
614 RTStrAssertMsgFailed(("wc=%#08x - endian indicator\n", wc));
615 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
616 }
617 *pCp = RTUNICP_INVALID;
618 (*ppwsz)++;
619 return rc;
620}
621RT_EXPORT_SYMBOL(RTUtf16GetCpExInternal);
622
623
624RTDECL(PRTUTF16) RTUtf16PutCpInternal(PRTUTF16 pwsz, RTUNICP CodePoint)
625{
626 /* simple */
627 if ( CodePoint < 0xd800
628 || ( CodePoint > 0xdfff
629 && CodePoint < 0xfffe))
630 {
631 *pwsz++ = (RTUTF16)CodePoint;
632 return pwsz;
633 }
634
635 /* surrogate pair */
636 if (CodePoint >= 0x10000 && CodePoint <= 0x0010ffff)
637 {
638 CodePoint -= 0x10000;
639 *pwsz++ = 0xd800 | (CodePoint >> 10);
640 *pwsz++ = 0xdc00 | (CodePoint & 0x3ff);
641 return pwsz;
642 }
643
644 /* invalid code point. */
645 RTStrAssertMsgFailed(("Invalid codepoint %#x\n", CodePoint));
646 *pwsz++ = 0x7f;
647 return pwsz;
648}
649RT_EXPORT_SYMBOL(RTUtf16PutCpInternal);
650
651
652/**
653 * Validate the UTF-16 encoding and calculates the length of a Latin1 encoding.
654 *
655 * @returns iprt status code.
656 * @param pwsz The UTF-16 string.
657 * @param cwc The max length of the UTF-16 string to consider.
658 * @param pcch Where to store the length (excluding '\\0') of the Latin1 string. (cch == cb, btw)
659 */
660static int rtUtf16CalcLatin1Length(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
661{
662 int rc = VINF_SUCCESS;
663 size_t cch = 0;
664 while (cwc > 0)
665 {
666 RTUTF16 wc = *pwsz++; cwc--;
667 if (!wc)
668 break;
669 else if (wc < 0xd800 || wc > 0xdfff)
670 {
671 if (wc < 0xfffe)
672 ++cch;
673 else
674 {
675 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
676 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
677 break;
678 }
679 }
680 else
681 {
682 if (wc >= 0xdc00)
683 {
684 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
685 rc = VERR_INVALID_UTF16_ENCODING;
686 break;
687 }
688 if (cwc <= 0)
689 {
690 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
691 rc = VERR_INVALID_UTF16_ENCODING;
692 break;
693 }
694 wc = *pwsz++; cwc--;
695 if (wc < 0xdc00 || wc > 0xdfff)
696 {
697 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
698 rc = VERR_INVALID_UTF16_ENCODING;
699 break;
700 }
701 ++cch;
702 }
703 }
704
705
706 /* done */
707 *pcch = cch;
708 return rc;
709}
710
711
712/**
713 * Recodes an valid UTF-16 string as Latin1.
714 *
715 * @returns iprt status code.
716 * @param pwsz The UTF-16 string.
717 * @param cwc The number of RTUTF16 characters to process from pwsz. The recoding
718 * will stop when cwc or '\\0' is reached.
719 * @param psz Where to store the Latin1 string.
720 * @param cch The size of the Latin1 buffer, excluding the terminator.
721 * @param pcch Where to store the number of octets actually encoded.
722 */
723static int rtUtf16RecodeAsLatin1(PCRTUTF16 pwsz, size_t cwc, char *psz, size_t cch, size_t *pcch)
724{
725 unsigned char *pwch = (unsigned char *)psz;
726 int rc = VINF_SUCCESS;
727 while (cwc > 0)
728 {
729 RTUTF16 wc = *pwsz++; cwc--;
730 if (!wc)
731 break;
732 else if (wc < 0xd800 || wc > 0xdfff)
733 {
734 if (wc < 0x100)
735 {
736 if (cch < 1)
737 {
738 RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
739 rc = VERR_BUFFER_OVERFLOW;
740 break;
741 }
742 cch--;
743 *pwch++ = (char)wc;
744 }
745 else if (wc < 0xfffe)
746 {
747 if (cch < 1)
748 {
749 RTStrAssertMsgFailed(("Buffer overflow! 3\n"));
750 rc = VERR_BUFFER_OVERFLOW;
751 break;
752 }
753 cch--;
754 *pwch++ = '?';
755 }
756 else
757 {
758 RTStrAssertMsgFailed(("endian indicator! wc=%#x\n", wc));
759 rc = VERR_CODE_POINT_ENDIAN_INDICATOR;
760 break;
761 }
762 }
763 else
764 {
765 if (wc >= 0xdc00)
766 {
767 RTStrAssertMsgFailed(("Wrong 1st char in surrogate! wc=%#x\n", wc));
768 rc = VERR_INVALID_UTF16_ENCODING;
769 break;
770 }
771 if (cwc <= 0)
772 {
773 RTStrAssertMsgFailed(("Invalid length! wc=%#x\n", wc));
774 rc = VERR_INVALID_UTF16_ENCODING;
775 break;
776 }
777 RTUTF16 wc2 = *pwsz++; cwc--;
778 if (wc2 < 0xdc00 || wc2 > 0xdfff)
779 {
780 RTStrAssertMsgFailed(("Wrong 2nd char in surrogate! wc=%#x\n", wc));
781 rc = VERR_INVALID_UTF16_ENCODING;
782 break;
783 }
784 if (cch < 1)
785 {
786 RTStrAssertMsgFailed(("Buffer overflow! 4\n"));
787 rc = VERR_BUFFER_OVERFLOW;
788 break;
789 }
790 cch--;
791 *pwch++ = '?';
792 }
793 }
794
795 /* done */
796 *pwch = '\0';
797 *pcch = (char *)pwch - psz;
798 return rc;
799}
800
801
802RTDECL(int) RTUtf16ToLatin1(PCRTUTF16 pwszString, char **ppszString)
803{
804 /*
805 * Validate input.
806 */
807 Assert(VALID_PTR(ppszString));
808 Assert(VALID_PTR(pwszString));
809 *ppszString = NULL;
810
811 /*
812 * Validate the UTF-16 string and calculate the length of the UTF-8 encoding of it.
813 */
814 size_t cch;
815 int rc = rtUtf16CalcLatin1Length(pwszString, RTSTR_MAX, &cch);
816 if (RT_SUCCESS(rc))
817 {
818 /*
819 * Allocate buffer and recode it.
820 */
821 char *pszResult = (char *)RTMemAlloc(cch + 1);
822 if (pszResult)
823 {
824 rc = rtUtf16RecodeAsLatin1(pwszString, RTSTR_MAX, pszResult, cch, &cch);
825 if (RT_SUCCESS(rc))
826 {
827 *ppszString = pszResult;
828 return rc;
829 }
830
831 RTMemFree(pszResult);
832 }
833 else
834 rc = VERR_NO_STR_MEMORY;
835 }
836 return rc;
837}
838RT_EXPORT_SYMBOL(RTUtf16ToLatin1);
839
840
841RTDECL(int) RTUtf16ToLatin1Ex(PCRTUTF16 pwszString, size_t cwcString, char **ppsz, size_t cch, size_t *pcch)
842{
843 /*
844 * Validate input.
845 */
846 Assert(VALID_PTR(pwszString));
847 Assert(VALID_PTR(ppsz));
848 Assert(!pcch || VALID_PTR(pcch));
849
850 /*
851 * Validate the UTF-16 string and calculate the length of the Latin1 encoding of it.
852 */
853 size_t cchResult;
854 int rc = rtUtf16CalcLatin1Length(pwszString, cwcString, &cchResult);
855 if (RT_SUCCESS(rc))
856 {
857 if (pcch)
858 *pcch = cchResult;
859
860 /*
861 * Check buffer size / Allocate buffer and recode it.
862 */
863 bool fShouldFree;
864 char *pszResult;
865 if (cch > 0 && *ppsz)
866 {
867 fShouldFree = false;
868 if (cch <= cchResult)
869 return VERR_BUFFER_OVERFLOW;
870 pszResult = *ppsz;
871 }
872 else
873 {
874 *ppsz = NULL;
875 fShouldFree = true;
876 cch = RT_MAX(cch, cchResult + 1);
877 pszResult = (char *)RTMemAlloc(cch);
878 }
879 if (pszResult)
880 {
881 rc = rtUtf16RecodeAsLatin1(pwszString, cwcString, pszResult, cch - 1, &cch);
882 if (RT_SUCCESS(rc))
883 {
884 *ppsz = pszResult;
885 return rc;
886 }
887
888 if (fShouldFree)
889 RTMemFree(pszResult);
890 }
891 else
892 rc = VERR_NO_STR_MEMORY;
893 }
894 return rc;
895}
896RT_EXPORT_SYMBOL(RTUtf16ToLatin1Ex);
897
898
899RTDECL(size_t) RTUtf16CalcLatin1Len(PCRTUTF16 pwsz)
900{
901 size_t cch;
902 int rc = rtUtf16CalcLatin1Length(pwsz, RTSTR_MAX, &cch);
903 return RT_SUCCESS(rc) ? cch : 0;
904}
905RT_EXPORT_SYMBOL(RTUtf16CalcLatin1Len);
906
907
908RTDECL(int) RTUtf16CalcLatin1LenEx(PCRTUTF16 pwsz, size_t cwc, size_t *pcch)
909{
910 size_t cch;
911 int rc = rtUtf16CalcLatin1Length(pwsz, cwc, &cch);
912 if (pcch)
913 *pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
914 return rc;
915}
916RT_EXPORT_SYMBOL(RTUtf16CalcLatin1LenEx);
917
918
919/**
920 * Calculates the UTF-16 length of a Latin1 string. In fact this is just the
921 * original length, but the function saves us nasty comments to that effect
922 * all over the place.
923 *
924 * @returns IPRT status code.
925 * @param psz Pointer to the Latin1 string.
926 * @param cch The max length of the string. (btw cch = cb)
927 * Use RTSTR_MAX if all of the string is to be examined.s
928 * @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
929 */
930static int rtLatin1CalcUtf16Length(const char *psz, size_t cch, size_t *pcwc)
931{
932 *pcwc = RTStrNLen(psz, cch);
933 return VINF_SUCCESS;
934}
935
936
937/**
938 * Recodes a Latin1 string as UTF-16. This is just a case of expanding it to
939 * sixteen bits, as Unicode is a superset of Latin1.
940 *
941 * Since we know the input is valid, we do *not* perform length checks.
942 *
943 * @returns iprt status code.
944 * @param psz The Latin1 string to recode.
945 * @param cch The number of chars (the type char, so bytes if you like) to process of the Latin1 string.
946 * The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
947 * @param pwsz Where to store the UTF-16 string.
948 * @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
949 * @param pcwc Where to store the actual number of RTUTF16 items encoded into the UTF-16. This excludes the terminator.
950 */
951static int rtLatin1RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc, size_t *pcwc)
952{
953 int rc = VINF_SUCCESS;
954 const unsigned char *puch = (const unsigned char *)psz;
955 const PRTUTF16 pwszEnd = pwsz + cwc;
956 PRTUTF16 pwc = pwsz;
957 Assert(pwszEnd >= pwc);
958 while (cch > 0)
959 {
960 /* read the next char and check for terminator. */
961 const unsigned char uch = *puch;
962 if (!uch)
963 break;
964
965 /* check for output overflow */
966 if (pwc >= pwszEnd)
967 {
968 rc = VERR_BUFFER_OVERFLOW;
969 break;
970 }
971
972 /* expand the code point */
973 *pwc++ = uch;
974 puch++;
975 cch--;
976 }
977
978 /* done */
979 *pwc = '\0';
980 *pcwc = pwc - pwsz;
981 return rc;
982}
983
984
985RTDECL(int) RTLatin1ToUtf16(const char *pszString, PRTUTF16 *ppwszString)
986{
987 /*
988 * Validate input.
989 */
990 Assert(VALID_PTR(ppwszString));
991 Assert(VALID_PTR(pszString));
992 *ppwszString = NULL;
993
994 /*
995 * Validate the input and calculate the length of the UTF-16 string.
996 */
997 size_t cwc;
998 int rc = rtLatin1CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
999 if (RT_SUCCESS(rc))
1000 {
1001 /*
1002 * Allocate buffer.
1003 */
1004 PRTUTF16 pwsz = (PRTUTF16)RTMemAlloc((cwc + 1) * sizeof(RTUTF16));
1005 if (pwsz)
1006 {
1007 /*
1008 * Encode the UTF-16 string.
1009 */
1010 rc = rtLatin1RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc, &cwc);
1011 if (RT_SUCCESS(rc))
1012 {
1013 *ppwszString = pwsz;
1014 return rc;
1015 }
1016 RTMemFree(pwsz);
1017 }
1018 else
1019 rc = VERR_NO_UTF16_MEMORY;
1020 }
1021 return rc;
1022}
1023RT_EXPORT_SYMBOL(RTLatin1ToUtf16);
1024
1025
1026RTDECL(int) RTLatin1ToUtf16Ex(const char *pszString, size_t cchString, PRTUTF16 *ppwsz, size_t cwc, size_t *pcwc)
1027{
1028 /*
1029 * Validate input.
1030 */
1031 Assert(VALID_PTR(pszString));
1032 Assert(VALID_PTR(ppwsz));
1033 Assert(!pcwc || VALID_PTR(pcwc));
1034
1035 /*
1036 * Validate the input and calculate the length of the UTF-16 string.
1037 */
1038 size_t cwcResult;
1039 int rc = rtLatin1CalcUtf16Length(pszString, cchString, &cwcResult);
1040 if (RT_SUCCESS(rc))
1041 {
1042 if (pcwc)
1043 *pcwc = cwcResult;
1044
1045 /*
1046 * Check buffer size / Allocate buffer.
1047 */
1048 bool fShouldFree;
1049 PRTUTF16 pwszResult;
1050 if (cwc > 0 && *ppwsz)
1051 {
1052 fShouldFree = false;
1053 if (cwc <= cwcResult)
1054 return VERR_BUFFER_OVERFLOW;
1055 pwszResult = *ppwsz;
1056 }
1057 else
1058 {
1059 *ppwsz = NULL;
1060 fShouldFree = true;
1061 cwc = RT_MAX(cwcResult + 1, cwc);
1062 pwszResult = (PRTUTF16)RTMemAlloc(cwc * sizeof(RTUTF16));
1063 }
1064 if (pwszResult)
1065 {
1066 /*
1067 * Encode the UTF-16 string.
1068 */
1069 rc = rtLatin1RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1, &cwcResult);
1070 if (RT_SUCCESS(rc))
1071 {
1072 *ppwsz = pwszResult;
1073 return rc;
1074 }
1075 if (fShouldFree)
1076 RTMemFree(pwszResult);
1077 }
1078 else
1079 rc = VERR_NO_UTF16_MEMORY;
1080 }
1081 return rc;
1082}
1083RT_EXPORT_SYMBOL(RTLatin1ToUtf16Ex);
1084
1085
1086RTDECL(size_t) RTLatin1CalcUtf16Len(const char *psz)
1087{
1088 size_t cwc;
1089 int rc = rtLatin1CalcUtf16Length(psz, RTSTR_MAX, &cwc);
1090 return RT_SUCCESS(rc) ? cwc : 0;
1091}
1092RT_EXPORT_SYMBOL(RTLatin1CalcUtf16Len);
1093
1094
1095RTDECL(int) RTLatin1CalcUtf16LenEx(const char *psz, size_t cch, size_t *pcwc)
1096{
1097 size_t cwc;
1098 int rc = rtLatin1CalcUtf16Length(psz, cch, &cwc);
1099 if (pcwc)
1100 *pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
1101 return rc;
1102}
1103RT_EXPORT_SYMBOL(RTLatin1CalcUtf16LenEx);
Note: See TracBrowser for help on using the repository browser.

© 2025 Oracle Support Privacy / Do Not Sell My Info Terms of Use Trademark Policy Automated Access Etiquette