utf-8.cpp@ 31418

Last change on this file since 31418 was 31418, checked in by vboxsync, 15 years ago
iprt/string.h,utf-8.cpp: avoid including uni.h; misc nits.
Property svn:eol-style set to `native` Property svn:keywords set to `Id`
File size: 53.7 KB

Line
1	/* $Id: utf-8.cpp 31418 2010-08-05 17:37:13Z vboxsync $ */
2	/** @file
3	* IPRT - UTF-8 Decoding.
4	*/
5
6	/*
7	* Copyright (C) 2006-2010 Oracle Corporation
8	*
9	* This file is part of VirtualBox Open Source Edition (OSE), as
10	* available from http://www.215389.xyz. This file is free software;
11	* you can redistribute it and/or modify it under the terms of the GNU
12	* General Public License (GPL) as published by the Free Software
13	* Foundation, in version 2 as it comes in the "COPYING" file of the
14	* VirtualBox OSE distribution. VirtualBox OSE is distributed in the
15	* hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
16	*
17	* The contents of this file may alternatively be used under the terms
18	* of the Common Development and Distribution License Version 1.0
19	* (CDDL) only, as it comes in the "COPYING.CDDL" file of the
20	* VirtualBox OSE distribution, in which case the provisions of the
21	* CDDL are applicable instead of those of the GPL.
22	*
23	* You may elect to license modified versions of this file under the
24	* terms and conditions of either the GPL or the CDDL or both.
25	*/
26
27
28	/*******************************************************************************
29	* Header Files *
30	*******************************************************************************/
31	#include <iprt/string.h>
32	#include "internal/iprt.h"
33
34	#include <iprt/uni.h>
35	#include <iprt/alloc.h>
36	#include <iprt/assert.h>
37	#include <iprt/err.h>
38	#include "internal/string.h"
39
40
41
42	/**
43	* Get get length in code points of a UTF-8 encoded string.
44	* The string is validated while doing this.
45	*
46	* @returns IPRT status code.
47	* @param psz Pointer to the UTF-8 string.
48	* @param cch The max length of the string. (btw cch = cb)
49	* Use RTSTR_MAX if all of the string is to be examined.
50	* @param pcuc Where to store the length in unicode code points.
51	* @param pcchActual Where to store the actual size of the UTF-8 string
52	* on success (cch = cb again). Optional.
53	*/
54	int rtUtf8Length(const char psz, size_t cch, size_t pcuc, size_t *pcchActual)
55	{
56	const unsigned char puch = (const unsigned char )psz;
57	size_t cCodePoints = 0;
58	while (cch > 0)
59	{
60	const unsigned char uch = *puch;
61	if (!uch)
62	break;
63	if (uch & RT_BIT(7))
64	{
65	/* figure sequence length and validate the first byte */
66	/** @todo RT_USE_RTC_3629 */
67	unsigned cb;
68	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
69	cb = 2;
70	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
71	cb = 3;
72	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
73	cb = 4;
74	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
75	cb = 5;
76	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
77	cb = 6;
78	else
79	{
80	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
81	return VERR_INVALID_UTF8_ENCODING;
82	}
83
84	/* check length */
85	if (cb > cch)
86	{
87	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
88	return VERR_INVALID_UTF8_ENCODING;
89	}
90
91	/* validate the rest */
92	switch (cb)
93	{
94	case 6:
95	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
96	case 5:
97	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
98	case 4:
99	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
100	case 3:
101	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
102	case 2:
103	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
104	break;
105	}
106
107	/* validate the code point. */
108	RTUNICP uc;
109	switch (cb)
110	{
111	case 6:
112	uc = (puch[5] & 0x3f)
113	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
114	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
115	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
116	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
117	\| ((RTUNICP)(uch & 0x01) << 30);
118	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
119	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
120	break;
121	case 5:
122	uc = (puch[4] & 0x3f)
123	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
124	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
125	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
126	\| ((RTUNICP)(uch & 0x03) << 24);
127	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
128	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
129	break;
130	case 4:
131	uc = (puch[3] & 0x3f)
132	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
133	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
134	\| ((RTUNICP)(uch & 0x07) << 18);
135	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
136	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
137	break;
138	case 3:
139	uc = (puch[2] & 0x3f)
140	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
141	\| ((RTUNICP)(uch & 0x0f) << 12);
142	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
143	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
144	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
145	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
146	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
147	break;
148	case 2:
149	uc = (puch[1] & 0x3f)
150	\| ((RTUNICP)(uch & 0x1f) << 6);
151	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
152	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
153	break;
154	}
155
156	/* advance */
157	cch -= cb;
158	puch += cb;
159	}
160	else
161	{
162	/* one ASCII byte */
163	puch++;
164	cch--;
165	}
166	cCodePoints++;
167	}
168
169	/* done */
170	*pcuc = cCodePoints;
171	if (pcchActual)
172	pcchActual = puch - (unsigned char const )psz;
173	return VINF_SUCCESS;
174	}
175
176
177	/**
178	* Decodes and UTF-8 string into an array of unicode code point.
179	*
180	* Since we know the input is valid, we do not perform encoding or length checks.
181	*
182	* @returns iprt status code.
183	* @param psz The UTF-8 string to recode. This is a valid encoding.
184	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
185	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
186	* @param paCps Where to store the code points array.
187	* @param cCps The number of RTUNICP items the paCps buffer can hold, excluding the terminator ('\\0').
188	*/
189	static int rtUtf8Decode(const char *psz, size_t cch, PRTUNICP paCps, size_t cCps)
190	{
191	int rc = VINF_SUCCESS;
192	const unsigned char puch = (const unsigned char )psz;
193	PRTUNICP pCp = paCps;
194	while (cch > 0)
195	{
196	/* read the next char and check for terminator. */
197	const unsigned char uch = *puch;
198	if (!uch)
199	break;
200
201	/* check for output overflow */
202	if (RT_UNLIKELY(cCps < 1))
203	{
204	rc = VERR_BUFFER_OVERFLOW;
205	break;
206	}
207	cCps--;
208
209	/* decode and recode the code point */
210	if (!(uch & RT_BIT(7)))
211	{
212	*pCp++ = uch;
213	puch++;
214	cch--;
215	}
216	#ifdef RT_STRICT
217	else if (!(uch & RT_BIT(6)))
218	AssertMsgFailed(("Internal error!\n"));
219	#endif
220	else if (!(uch & RT_BIT(5)))
221	{
222	*pCp++ = (puch[1] & 0x3f)
223	\| ((uint16_t)(uch & 0x1f) << 6);
224	puch += 2;
225	cch -= 2;
226	}
227	else if (!(uch & RT_BIT(4)))
228	{
229	*pCp++ = (puch[2] & 0x3f)
230	\| ((uint16_t)(puch[1] & 0x3f) << 6)
231	\| ((uint16_t)(uch & 0x0f) << 12);
232	puch += 3;
233	cch -= 3;
234	}
235	else if (!(uch & RT_BIT(3)))
236	{
237	*pCp++ = (puch[3] & 0x3f)
238	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
239	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
240	\| ((RTUNICP)(uch & 0x07) << 18);
241	puch += 4;
242	cch -= 4;
243	}
244	else if (!(uch & RT_BIT(2)))
245	{
246	*pCp++ = (puch[4] & 0x3f)
247	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
248	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
249	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
250	\| ((RTUNICP)(uch & 0x03) << 24);
251	puch += 5;
252	cch -= 6;
253	}
254	else
255	{
256	Assert(!(uch & RT_BIT(1)));
257	*pCp++ = (puch[5] & 0x3f)
258	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
259	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
260	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
261	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
262	\| ((RTUNICP)(uch & 0x01) << 30);
263	puch += 6;
264	cch -= 6;
265	}
266	}
267
268	/* done */
269	*pCp = 0;
270	return rc;
271	}
272
273
274	RTDECL(size_t) RTStrUniLen(const char *psz)
275	{
276	size_t cCodePoints;
277	int rc = rtUtf8Length(psz, RTSTR_MAX, &cCodePoints, NULL);
278	return RT_SUCCESS(rc) ? cCodePoints : 0;
279	}
280	RT_EXPORT_SYMBOL(RTStrUniLen);
281
282
283	RTDECL(int) RTStrUniLenEx(const char psz, size_t cch, size_t pcCps)
284	{
285	size_t cCodePoints;
286	int rc = rtUtf8Length(psz, cch, &cCodePoints, NULL);
287	if (pcCps)
288	*pcCps = RT_SUCCESS(rc) ? cCodePoints : 0;
289	return rc;
290	}
291	RT_EXPORT_SYMBOL(RTStrUniLenEx);
292
293
294	RTDECL(int) RTStrValidateEncoding(const char *psz)
295	{
296	return RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
297	}
298	RT_EXPORT_SYMBOL(RTStrValidateEncoding);
299
300
301	RTDECL(int) RTStrValidateEncodingEx(const char *psz, size_t cch, uint32_t fFlags)
302	{
303	AssertReturn(!(fFlags & ~(RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)), VERR_INVALID_PARAMETER);
304	AssertPtr(psz);
305
306	/*
307	* Use rtUtf8Length for the job.
308	*/
309	size_t cchActual;
310	size_t cCpsIgnored;
311	int rc = rtUtf8Length(psz, cch, &cCpsIgnored, &cchActual);
312	if (RT_SUCCESS(rc))
313	{
314	if ( (fFlags & RTSTR_VALIDATE_ENCODING_ZERO_TERMINATED)
315	&& cchActual >= cch)
316	rc = VERR_BUFFER_OVERFLOW;
317	}
318	return rc;
319	}
320	RT_EXPORT_SYMBOL(RTStrValidateEncodingEx);
321
322
323	RTDECL(bool) RTStrIsValidEncoding(const char *psz)
324	{
325	int rc = RTStrValidateEncodingEx(psz, RTSTR_MAX, 0);
326	return RT_SUCCESS(rc);
327	}
328	RT_EXPORT_SYMBOL(RTStrIsValidEncoding);
329
330
331	RTDECL(size_t) RTStrPurgeEncoding(char *psz)
332	{
333	size_t cErrors = 0;
334	for (;;)
335	{
336	RTUNICP Cp;
337	int rc = RTStrGetCpEx((const char **)&psz, &Cp);
338	if (RT_SUCCESS(rc))
339	{
340	if (!Cp)
341	break;
342	}
343	else
344	{
345	psz[-1] = '?';
346	cErrors++;
347	}
348	}
349	return cErrors;
350	}
351	RT_EXPORT_SYMBOL(RTStrPurgeEncoding);
352
353
354	RTDECL(int) RTStrToUni(const char pszString, PRTUNICP ppaCps)
355	{
356	/*
357	* Validate input.
358	*/
359	Assert(VALID_PTR(pszString));
360	Assert(VALID_PTR(ppaCps));
361	*ppaCps = NULL;
362
363	/*
364	* Validate the UTF-8 input and count its code points.
365	*/
366	size_t cCps;
367	int rc = rtUtf8Length(pszString, RTSTR_MAX, &cCps, NULL);
368	if (RT_SUCCESS(rc))
369	{
370	/*
371	* Allocate buffer.
372	*/
373	PRTUNICP paCps = (PRTUNICP)RTMemAlloc((cCps + 1) * sizeof(RTUNICP));
374	if (paCps)
375	{
376	/*
377	* Decode the string.
378	*/
379	rc = rtUtf8Decode(pszString, RTSTR_MAX, paCps, cCps);
380	if (RT_SUCCESS(rc))
381	{
382	*ppaCps = paCps;
383	return rc;
384	}
385	RTMemFree(paCps);
386	}
387	else
388	rc = VERR_NO_CODE_POINT_MEMORY;
389	}
390	return rc;
391	}
392	RT_EXPORT_SYMBOL(RTStrToUni);
393
394
395	RTDECL(int) RTStrToUniEx(const char pszString, size_t cchString, PRTUNICP ppaCps, size_t cCps, size_t *pcCps)
396	{
397	/*
398	* Validate input.
399	*/
400	Assert(VALID_PTR(pszString));
401	Assert(VALID_PTR(ppaCps));
402	Assert(!pcCps \|\| VALID_PTR(pcCps));
403
404	/*
405	* Validate the UTF-8 input and count the code points.
406	*/
407	size_t cCpsResult;
408	int rc = rtUtf8Length(pszString, cchString, &cCpsResult, NULL);
409	if (RT_SUCCESS(rc))
410	{
411	if (pcCps)
412	*pcCps = cCpsResult;
413
414	/*
415	* Check buffer size / Allocate buffer.
416	*/
417	bool fShouldFree;
418	PRTUNICP paCpsResult;
419	if (cCps > 0 && *ppaCps)
420	{
421	fShouldFree = false;
422	if (cCps <= cCpsResult)
423	return VERR_BUFFER_OVERFLOW;
424	paCpsResult = *ppaCps;
425	}
426	else
427	{
428	*ppaCps = NULL;
429	fShouldFree = true;
430	cCps = RT_MAX(cCpsResult + 1, cCps);
431	paCpsResult = (PRTUNICP)RTMemAlloc(cCps * sizeof(RTUNICP));
432	}
433	if (paCpsResult)
434	{
435	/*
436	* Encode the UTF-16 string.
437	*/
438	rc = rtUtf8Decode(pszString, cchString, paCpsResult, cCps - 1);
439	if (RT_SUCCESS(rc))
440	{
441	*ppaCps = paCpsResult;
442	return rc;
443	}
444	if (fShouldFree)
445	RTMemFree(paCpsResult);
446	}
447	else
448	rc = VERR_NO_CODE_POINT_MEMORY;
449	}
450	return rc;
451	}
452	RT_EXPORT_SYMBOL(RTStrToUniEx);
453
454
455	/**
456	* Calculates the UTF-16 length of a string, validating the encoding while doing so.
457	*
458	* @returns IPRT status code.
459	* @param psz Pointer to the UTF-8 string.
460	* @param cch The max length of the string. (btw cch = cb)
461	* Use RTSTR_MAX if all of the string is to be examined.
462	* @param pcwc Where to store the length of the UTF-16 string as a number of RTUTF16 characters.
463	*/
464	static int rtUtf8CalcUtf16Length(const char psz, size_t cch, size_t pcwc)
465	{
466	const unsigned char puch = (const unsigned char )psz;
467	size_t cwc = 0;
468	while (cch > 0)
469	{
470	const unsigned char uch = *puch;
471	if (!uch)
472	break;
473	if (!(uch & RT_BIT(7)))
474	{
475	/* one ASCII byte */
476	cwc++;
477	puch++;
478	cch--;
479	}
480	else
481	{
482	/* figure sequence length and validate the first byte */
483	unsigned cb;
484	if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
485	cb = 2;
486	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
487	cb = 3;
488	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)))
489	cb = 4;
490	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3)))
491	cb = 5;
492	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2) \| RT_BIT(1))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3) \| RT_BIT(2)))
493	cb = 6;
494	else
495	{
496	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.*Rhxs\n", RT_MIN(cch, 10), puch));
497	return VERR_INVALID_UTF8_ENCODING;
498	}
499
500	/* check length */
501	if (cb > cch)
502	{
503	RTStrAssertMsgFailed(("Invalid UTF-8 length: cb=%d cch=%d (%.*Rhxs)\n", cb, cch, RT_MIN(cch, 10), puch));
504	return VERR_INVALID_UTF8_ENCODING;
505	}
506
507	/* validate the rest */
508	switch (cb)
509	{
510	case 6:
511	RTStrAssertMsgReturn((puch[5] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("6/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
512	case 5:
513	RTStrAssertMsgReturn((puch[4] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("5/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
514	case 4:
515	RTStrAssertMsgReturn((puch[3] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("4/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
516	case 3:
517	RTStrAssertMsgReturn((puch[2] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("3/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
518	case 2:
519	RTStrAssertMsgReturn((puch[1] & (RT_BIT(7) \| RT_BIT(6))) == RT_BIT(7), ("2/%u: %.*Rhxs\n", cb, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
520	break;
521	}
522
523	/* validate the code point. */
524	RTUNICP uc;
525	switch (cb)
526	{
527	case 6:
528	uc = (puch[5] & 0x3f)
529	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
530	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
531	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
532	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
533	\| ((RTUNICP)(uch & 0x01) << 30);
534	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
535	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
536	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
537	return VERR_CANT_RECODE_AS_UTF16;
538	case 5:
539	uc = (puch[4] & 0x3f)
540	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
541	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
542	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
543	\| ((RTUNICP)(uch & 0x03) << 24);
544	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
545	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
546	RTStrAssertMsgFailed(("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch));
547	return VERR_CANT_RECODE_AS_UTF16;
548	case 4:
549	uc = (puch[3] & 0x3f)
550	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
551	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
552	\| ((RTUNICP)(uch & 0x07) << 18);
553	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
554	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
555	RTStrAssertMsgReturn(uc <= 0x0010ffff,
556	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CANT_RECODE_AS_UTF16);
557	cwc++;
558	break;
559	case 3:
560	uc = (puch[2] & 0x3f)
561	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
562	\| ((RTUNICP)(uch & 0x0f) << 12);
563	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
564	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch),
565	uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING);
566	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
567	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_CODE_POINT_SURROGATE);
568	break;
569	case 2:
570	uc = (puch[1] & 0x3f)
571	\| ((RTUNICP)(uch & 0x1f) << 6);
572	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
573	("%u: cp=%#010RX32: %.*Rhxs\n", cb, uc, RT_MIN(cb + 10, cch), puch), VERR_INVALID_UTF8_ENCODING);
574	break;
575	}
576
577	/* advance */
578	cch -= cb;
579	puch += cb;
580	cwc++;
581	}
582	}
583
584	/* done */
585	*pcwc = cwc;
586	return VINF_SUCCESS;
587	}
588
589
590	/**
591	* Recodes a valid UTF-8 string as UTF-16.
592	*
593	* Since we know the input is valid, we do not perform encoding or length checks.
594	*
595	* @returns iprt status code.
596	* @param psz The UTF-8 string to recode. This is a valid encoding.
597	* @param cch The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
598	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
599	* @param pwsz Where to store the UTF-16 string.
600	* @param cwc The number of RTUTF16 items the pwsz buffer can hold, excluding the terminator ('\\0').
601	*/
602	static int rtUtf8RecodeAsUtf16(const char *psz, size_t cch, PRTUTF16 pwsz, size_t cwc)
603	{
604	int rc = VINF_SUCCESS;
605	const unsigned char puch = (const unsigned char )psz;
606	PRTUTF16 pwc = pwsz;
607	while (cch > 0)
608	{
609	/* read the next char and check for terminator. */
610	const unsigned char uch = *puch;
611	if (!uch)
612	break;
613
614	/* check for output overflow */
615	if (RT_UNLIKELY(cwc < 1))
616	{
617	rc = VERR_BUFFER_OVERFLOW;
618	break;
619	}
620	cwc--;
621
622	/* decode and recode the code point */
623	if (!(uch & RT_BIT(7)))
624	{
625	*pwc++ = uch;
626	puch++;
627	cch--;
628	}
629	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5))) == (RT_BIT(7) \| RT_BIT(6)))
630	{
631	uint16_t uc = (puch[1] & 0x3f)
632	\| ((uint16_t)(uch & 0x1f) << 6);
633	*pwc++ = uc;
634	puch += 2;
635	cch -= 2;
636	}
637	else if ((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5)))
638	{
639	uint16_t uc = (puch[2] & 0x3f)
640	\| ((uint16_t)(puch[1] & 0x3f) << 6)
641	\| ((uint16_t)(uch & 0x0f) << 12);
642	*pwc++ = uc;
643	puch += 3;
644	cch -= 3;
645	}
646	else
647	{
648	/* generate surrugate pair */
649	Assert((uch & (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4) \| RT_BIT(3))) == (RT_BIT(7) \| RT_BIT(6) \| RT_BIT(5) \| RT_BIT(4)));
650	RTUNICP uc = (puch[3] & 0x3f)
651	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
652	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
653	\| ((RTUNICP)(uch & 0x07) << 18);
654	if (RT_UNLIKELY(cwc < 1))
655	{
656	rc = VERR_BUFFER_OVERFLOW;
657	break;
658	}
659	cwc--;
660
661	uc -= 0x10000;
662	*pwc++ = 0xd800 \| (uc >> 10);
663	*pwc++ = 0xdc00 \| (uc & 0x3ff);
664	puch += 4;
665	cch -= 4;
666	}
667	}
668
669	/* done */
670	*pwc = '\0';
671	return rc;
672	}
673
674
675	RTDECL(int) RTStrToUtf16Tag(const char pszString, PRTUTF16 ppwszString, const char *pszTag)
676	{
677	/*
678	* Validate input.
679	*/
680	Assert(VALID_PTR(ppwszString));
681	Assert(VALID_PTR(pszString));
682	*ppwszString = NULL;
683
684	/*
685	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
686	*/
687	size_t cwc;
688	int rc = rtUtf8CalcUtf16Length(pszString, RTSTR_MAX, &cwc);
689	if (RT_SUCCESS(rc))
690	{
691	/*
692	* Allocate buffer.
693	*/
694	PRTUTF16 pwsz = (PRTUTF16)RTMemAllocTag((cwc + 1) * sizeof(RTUTF16), pszTag);
695	if (pwsz)
696	{
697	/*
698	* Encode the UTF-16 string.
699	*/
700	rc = rtUtf8RecodeAsUtf16(pszString, RTSTR_MAX, pwsz, cwc);
701	if (RT_SUCCESS(rc))
702	{
703	*ppwszString = pwsz;
704	return rc;
705	}
706	RTMemFree(pwsz);
707	}
708	else
709	rc = VERR_NO_UTF16_MEMORY;
710	}
711	return rc;
712	}
713	RT_EXPORT_SYMBOL(RTStrToUtf16Tag);
714
715
716	RTDECL(int) RTStrToUtf16ExTag(const char *pszString, size_t cchString,
717	PRTUTF16 ppwsz, size_t cwc, size_t pcwc, const char *pszTag)
718	{
719	/*
720	* Validate input.
721	*/
722	Assert(VALID_PTR(pszString));
723	Assert(VALID_PTR(ppwsz));
724	Assert(!pcwc \|\| VALID_PTR(pcwc));
725
726	/*
727	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
728	*/
729	size_t cwcResult;
730	int rc = rtUtf8CalcUtf16Length(pszString, cchString, &cwcResult);
731	if (RT_SUCCESS(rc))
732	{
733	if (pcwc)
734	*pcwc = cwcResult;
735
736	/*
737	* Check buffer size / Allocate buffer.
738	*/
739	bool fShouldFree;
740	PRTUTF16 pwszResult;
741	if (cwc > 0 && *ppwsz)
742	{
743	fShouldFree = false;
744	if (cwc <= cwcResult)
745	return VERR_BUFFER_OVERFLOW;
746	pwszResult = *ppwsz;
747	}
748	else
749	{
750	*ppwsz = NULL;
751	fShouldFree = true;
752	cwc = RT_MAX(cwcResult + 1, cwc);
753	pwszResult = (PRTUTF16)RTMemAllocTag(cwc * sizeof(RTUTF16), pszTag);
754	}
755	if (pwszResult)
756	{
757	/*
758	* Encode the UTF-16 string.
759	*/
760	rc = rtUtf8RecodeAsUtf16(pszString, cchString, pwszResult, cwc - 1);
761	if (RT_SUCCESS(rc))
762	{
763	*ppwsz = pwszResult;
764	return rc;
765	}
766	if (fShouldFree)
767	RTMemFree(pwszResult);
768	}
769	else
770	rc = VERR_NO_UTF16_MEMORY;
771	}
772	return rc;
773	}
774	RT_EXPORT_SYMBOL(RTStrToUtf16ExTag);
775
776
777	RTDECL(size_t) RTStrCalcUtf16Len(const char *psz)
778	{
779	size_t cwc;
780	int rc = rtUtf8CalcUtf16Length(psz, RTSTR_MAX, &cwc);
781	return RT_SUCCESS(rc) ? cwc : 0;
782	}
783	RT_EXPORT_SYMBOL(RTStrCalcUtf16Len);
784
785
786	RTDECL(int) RTStrCalcUtf16LenEx(const char psz, size_t cch, size_t pcwc)
787	{
788	size_t cwc;
789	int rc = rtUtf8CalcUtf16Length(psz, cch, &cwc);
790	if (pcwc)
791	*pcwc = RT_SUCCESS(rc) ? cwc : ~(size_t)0;
792	return rc;
793	}
794	RT_EXPORT_SYMBOL(RTStrCalcUtf16LenEx);
795
796
797	/**
798	* Calculates the length of the UTF-8 encoding of a Latin-1 string.
799	*
800	* @returns iprt status code.
801	* @param psz The Latin-1 string.
802	* @param cchIn The max length of the Latin-1 string to consider.
803	* @param pcch Where to store the length (excluding '\\0') of the UTF-8 string. (cch == cb, btw)
804	*/
805	static int rtLatin1CalcUtf8Length(const char psz, size_t cchIn, size_t pcch)
806	{
807	size_t cch = 0;
808	for (;;)
809	{
810	RTUNICP Cp;
811	size_t cchCp;
812	int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
813	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
814	break;
815	if (RT_FAILURE(rc))
816	return rc;
817	cch += RTStrCpSize(Cp); /* cannot fail */
818	}
819
820	/* done */
821	*pcch = cch;
822	return VINF_SUCCESS;
823	}
824
825
826	/**
827	* Recodes a Latin-1 string as UTF-8.
828	*
829	* @returns iprt status code.
830	* @param psz The Latin-1 string.
831	* @param cchIn The number of characters to process from psz. The recoding
832	* will stop when cch or '\\0' is reached.
833	* @param psz Where to store the UTF-8 string.
834	* @param cch The size of the UTF-8 buffer, excluding the terminator.
835	*/
836	static int rtLatin1RecodeAsUtf8(const char pszIn, size_t cchIn, char psz, size_t cch)
837	{
838	int rc = VINF_SUCCESS;
839	for (;;)
840	{
841	RTUNICP Cp;
842	size_t cchCp;
843	rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp);
844	if (Cp == 0 \|\| RT_FAILURE(rc))
845	break;
846	cchCp = RTStrCpSize(Cp);
847	if (RT_UNLIKELY(cch < cchCp))
848	{
849	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
850	rc = VERR_BUFFER_OVERFLOW;
851	break;
852	}
853	cch -= cchCp;
854	psz = RTStrPutCp(psz, Cp);
855	}
856
857	/* done */
858	if (rc == VERR_END_OF_STRING)
859	rc = VINF_SUCCESS;
860	*psz = '\0';
861	return rc;
862	}
863
864
865
866	RTDECL(int) RTLatin1ToUtf8Tag(const char pszString, char ppszString, const char pszTag)
867	{
868	/*
869	* Validate input.
870	*/
871	Assert(VALID_PTR(ppszString));
872	Assert(VALID_PTR(pszString));
873	*ppszString = NULL;
874
875	/*
876	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
877	*/
878	size_t cch;
879	int rc = rtLatin1CalcUtf8Length(pszString, RTSTR_MAX, &cch);
880	if (RT_SUCCESS(rc))
881	{
882	/*
883	* Allocate buffer and recode it.
884	*/
885	char pszResult = (char )RTMemAllocTag(cch + 1, pszTag);
886	if (pszResult)
887	{
888	rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
889	if (RT_SUCCESS(rc))
890	{
891	*ppszString = pszResult;
892	return rc;
893	}
894
895	RTMemFree(pszResult);
896	}
897	else
898	rc = VERR_NO_STR_MEMORY;
899	}
900	return rc;
901	}
902	RT_EXPORT_SYMBOL(RTLatin1ToUtf8Tag);
903
904
905	RTDECL(int) RTLatin1ToUtf8ExTag(const char pszString, size_t cchString, char ppsz, size_t cch, size_t pcch, const char *pszTag)
906	{
907	/*
908	* Validate input.
909	*/
910	Assert(VALID_PTR(pszString));
911	Assert(VALID_PTR(ppsz));
912	Assert(!pcch \|\| VALID_PTR(pcch));
913
914	/*
915	* Calculate the length of the UTF-8 encoding of the Latin-1 string.
916	*/
917	size_t cchResult;
918	int rc = rtLatin1CalcUtf8Length(pszString, cchString, &cchResult);
919	if (RT_SUCCESS(rc))
920	{
921	if (pcch)
922	*pcch = cchResult;
923
924	/*
925	* Check buffer size / Allocate buffer and recode it.
926	*/
927	bool fShouldFree;
928	char *pszResult;
929	if (cch > 0 && *ppsz)
930	{
931	fShouldFree = false;
932	if (RT_UNLIKELY(cch <= cchResult))
933	return VERR_BUFFER_OVERFLOW;
934	pszResult = *ppsz;
935	}
936	else
937	{
938	*ppsz = NULL;
939	fShouldFree = true;
940	cch = RT_MAX(cch, cchResult + 1);
941	pszResult = (char *)RTStrAllocTag(cch, pszTag);
942	}
943	if (pszResult)
944	{
945	rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
946	if (RT_SUCCESS(rc))
947	{
948	*ppsz = pszResult;
949	return rc;
950	}
951
952	if (fShouldFree)
953	RTStrFree(pszResult);
954	}
955	else
956	rc = VERR_NO_STR_MEMORY;
957	}
958	return rc;
959	}
960	RT_EXPORT_SYMBOL(RTLatin1ToUtf8ExTag);
961
962
963	RTDECL(size_t) RTLatin1CalcUtf8Len(const char *psz)
964	{
965	size_t cch;
966	int rc = rtLatin1CalcUtf8Length(psz, RTSTR_MAX, &cch);
967	return RT_SUCCESS(rc) ? cch : 0;
968	}
969	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8Len);
970
971
972	RTDECL(int) RTLatin1CalcUtf8LenEx(const char psz, size_t cchIn, size_t pcch)
973	{
974	size_t cch;
975	int rc = rtLatin1CalcUtf8Length(psz, cchIn, &cch);
976	if (pcch)
977	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
978	return rc;
979	}
980	RT_EXPORT_SYMBOL(RTLatin1CalcUtf8LenEx);
981
982
983	/**
984	* Calculates the Latin-1 length of a string, validating the encoding while
985	* doing so.
986	*
987	* @returns IPRT status code.
988	* @param psz Pointer to the UTF-8 string.
989	* @param cchIn The max length of the string. (btw cch = cb)
990	* Use RTSTR_MAX if all of the string is to be examined.
991	* @param pcch Where to store the length of the Latin-1 string in bytes.
992	*/
993	static int rtUtf8CalcLatin1Length(const char psz, size_t cchIn, size_t pcch)
994	{
995	size_t cch = 0;
996	for (;;)
997	{
998	RTUNICP Cp;
999	size_t cchCp;
1000	int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp);
1001	if (Cp == 0 \|\| rc == VERR_END_OF_STRING)
1002	break;
1003	if (RT_FAILURE(rc))
1004	return rc;
1005	cchCp = RTLatin1CpSize(Cp);
1006	if (cchCp == 0)
1007	return VERR_NO_TRANSLATION;
1008	cch += cchCp;
1009	}
1010
1011	/* done */
1012	*pcch = cch;
1013	return VINF_SUCCESS;
1014	}
1015
1016
1017	/**
1018	* Recodes a valid UTF-8 string as Latin-1.
1019	*
1020	* Since we know the input is valid, we do not perform encoding or length checks.
1021	*
1022	* @returns iprt status code.
1023	* @param pszIn The UTF-8 string to recode. This is a valid encoding.
1024	* @param cchIn The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
1025	* The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
1026	* @param psz Where to store the Latin-1 string.
1027	* @param cch The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
1028	*/
1029	static int rtUtf8RecodeAsLatin1(const char pszIn, size_t cchIn, char psz, size_t cch)
1030	{
1031	int rc = VINF_SUCCESS;
1032
1033	for (;;)
1034	{
1035	RTUNICP Cp;
1036	size_t cchCp;
1037	rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
1038	if (Cp == 0 \|\| RT_FAILURE(rc))
1039	break;
1040	cchCp = RTLatin1CpSize(Cp);
1041	if (RT_UNLIKELY(cch < cchCp))
1042	{
1043	RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
1044	rc = VERR_BUFFER_OVERFLOW;
1045	break;
1046	}
1047	cch -= cchCp;
1048	psz = RTLatin1PutCp(psz, Cp);
1049	}
1050
1051	/* done */
1052	if (rc == VERR_END_OF_STRING)
1053	rc = VINF_SUCCESS;
1054	*psz = '\0';
1055	return rc;
1056	}
1057
1058
1059
1060	RTDECL(int) RTStrToLatin1Tag(const char pszString, char ppszString, const char pszTag)
1061	{
1062	/*
1063	* Validate input.
1064	*/
1065	Assert(VALID_PTR(ppszString));
1066	Assert(VALID_PTR(pszString));
1067	*ppszString = NULL;
1068
1069	/*
1070	* Validate the UTF-8 input and calculate the length of the Latin-1 string.
1071	*/
1072	size_t cch;
1073	int rc = rtUtf8CalcLatin1Length(pszString, RTSTR_MAX, &cch);
1074	if (RT_SUCCESS(rc))
1075	{
1076	/*
1077	* Allocate buffer.
1078	*/
1079	char psz = (char )RTMemAllocTag(cch + 1, pszTag);
1080	if (psz)
1081	{
1082	/*
1083	* Encode the UTF-16 string.
1084	*/
1085	rc = rtUtf8RecodeAsLatin1(pszString, RTSTR_MAX, psz, cch);
1086	if (RT_SUCCESS(rc))
1087	{
1088	*ppszString = psz;
1089	return rc;
1090	}
1091	RTMemFree(psz);
1092	}
1093	else
1094	rc = VERR_NO_STR_MEMORY;
1095	}
1096	return rc;
1097	}
1098	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1099
1100
1101	RTDECL(int) RTStrToLatin1ExTag(const char *pszString, size_t cchString,
1102	char *ppsz, size_t cch, size_t pcch, const char *pszTag)
1103	{
1104	/*
1105	* Validate input.
1106	*/
1107	Assert(VALID_PTR(pszString));
1108	Assert(VALID_PTR(ppsz));
1109	Assert(!pcch \|\| VALID_PTR(pcch));
1110
1111	/*
1112	* Validate the UTF-8 input and calculate the length of the UTF-16 string.
1113	*/
1114	size_t cchResult;
1115	int rc = rtUtf8CalcLatin1Length(pszString, cchString, &cchResult);
1116	if (RT_SUCCESS(rc))
1117	{
1118	if (pcch)
1119	*pcch = cchResult;
1120
1121	/*
1122	* Check buffer size / Allocate buffer.
1123	*/
1124	bool fShouldFree;
1125	char *pszResult;
1126	if (cch > 0 && *ppsz)
1127	{
1128	fShouldFree = false;
1129	if (cch <= cchResult)
1130	return VERR_BUFFER_OVERFLOW;
1131	pszResult = *ppsz;
1132	}
1133	else
1134	{
1135	*ppsz = NULL;
1136	fShouldFree = true;
1137	cch = RT_MAX(cchResult + 1, cch);
1138	pszResult = (char *)RTMemAllocTag(cch, pszTag);
1139	}
1140	if (pszResult)
1141	{
1142	/*
1143	* Encode the Latin-1 string.
1144	*/
1145	rc = rtUtf8RecodeAsLatin1(pszString, cchString, pszResult, cch - 1);
1146	if (RT_SUCCESS(rc))
1147	{
1148	*ppsz = pszResult;
1149	return rc;
1150	}
1151	if (fShouldFree)
1152	RTMemFree(pszResult);
1153	}
1154	else
1155	rc = VERR_NO_STR_MEMORY;
1156	}
1157	return rc;
1158	}
1159	RT_EXPORT_SYMBOL(RTStrToLatin1Tag);
1160
1161
1162	RTDECL(size_t) RTStrCalcLatin1Len(const char *psz)
1163	{
1164	size_t cch;
1165	int rc = rtUtf8CalcLatin1Length(psz, RTSTR_MAX, &cch);
1166	return RT_SUCCESS(rc) ? cch : 0;
1167	}
1168	RT_EXPORT_SYMBOL(RTStrCalcLatin1Len);
1169
1170
1171	RTDECL(int) RTStrCalcLatin1LenEx(const char psz, size_t cchIn, size_t pcch)
1172	{
1173	size_t cch;
1174	int rc = rtUtf8CalcLatin1Length(psz, cchIn, &cch);
1175	if (pcch)
1176	*pcch = RT_SUCCESS(rc) ? cch : ~(size_t)0;
1177	return rc;
1178	}
1179	RT_EXPORT_SYMBOL(RTStrCalcLatin1LenEx);
1180
1181
1182	/**
1183	* Handle invalid encodings passed to RTStrGetCp() and RTStrGetCpEx().
1184	* @returns rc
1185	* @param ppsz The pointer to the string position point.
1186	* @param pCp Where to store RTUNICP_INVALID.
1187	* @param rc The iprt error code.
1188	*/
1189	static int rtStrGetCpExFailure(const char **ppsz, PRTUNICP pCp, int rc)
1190	{
1191	/*
1192	* Try find a valid encoding.
1193	*/
1194	(ppsz)++; /* @todo code this! */
1195	*pCp = RTUNICP_INVALID;
1196	return rc;
1197	}
1198
1199
1200	RTDECL(RTUNICP) RTStrGetCpInternal(const char *psz)
1201	{
1202	RTUNICP Cp;
1203	RTStrGetCpExInternal(&psz, &Cp);
1204	return Cp;
1205	}
1206	RT_EXPORT_SYMBOL(RTStrGetCpInternal);
1207
1208
1209	RTDECL(int) RTStrGetCpExInternal(const char **ppsz, PRTUNICP pCp)
1210	{
1211	const unsigned char puch = (const unsigned char )*ppsz;
1212	const unsigned char uch = *puch;
1213	RTUNICP uc;
1214
1215	/* ASCII ? */
1216	if (!(uch & RT_BIT(7)))
1217	{
1218	uc = uch;
1219	puch++;
1220	}
1221	else if (uch & RT_BIT(6))
1222	{
1223	/* figure the length and validate the first octet. */
1224	/** @todo RT_USE_RTC_3629 */
1225	unsigned cb;
1226	if (!(uch & RT_BIT(5)))
1227	cb = 2;
1228	else if (!(uch & RT_BIT(4)))
1229	cb = 3;
1230	else if (!(uch & RT_BIT(3)))
1231	cb = 4;
1232	else if (!(uch & RT_BIT(2)))
1233	cb = 5;
1234	else if (!(uch & RT_BIT(1)))
1235	cb = 6;
1236	else
1237	{
1238	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1239	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1240	}
1241
1242	/* validate the rest */
1243	switch (cb)
1244	{
1245	case 6:
1246	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1247	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1248	case 5:
1249	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1250	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1251	case 4:
1252	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1253	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1254	case 3:
1255	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1256	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1257	case 2:
1258	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1259	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1260	break;
1261	}
1262
1263	/* get and validate the code point. */
1264	switch (cb)
1265	{
1266	case 6:
1267	uc = (puch[5] & 0x3f)
1268	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1269	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1270	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1271	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1272	\| ((RTUNICP)(uch & 0x01) << 30);
1273	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1274	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1275	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1276	break;
1277	case 5:
1278	uc = (puch[4] & 0x3f)
1279	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1280	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1281	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1282	\| ((RTUNICP)(uch & 0x03) << 24);
1283	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1284	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1285	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1286	break;
1287	case 4:
1288	uc = (puch[3] & 0x3f)
1289	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1290	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1291	\| ((RTUNICP)(uch & 0x07) << 18);
1292	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1293	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1294	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1295	break;
1296	case 3:
1297	uc = (puch[2] & 0x3f)
1298	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1299	\| ((RTUNICP)(uch & 0x0f) << 12);
1300	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1301	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1302	rtStrGetCpExFailure(ppsz, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1303	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1304	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1305	rtStrGetCpExFailure(ppsz, pCp, VERR_CODE_POINT_SURROGATE));
1306	break;
1307	case 2:
1308	uc = (puch[1] & 0x3f)
1309	\| ((RTUNICP)(uch & 0x1f) << 6);
1310	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1311	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1312	rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING));
1313	break;
1314	default: /* impossible, but GCC is bitching. */
1315	uc = RTUNICP_INVALID;
1316	break;
1317	}
1318	puch += cb;
1319	}
1320	else
1321	{
1322	/* 6th bit is always set. */
1323	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1324	return rtStrGetCpExFailure(ppsz, pCp, VERR_INVALID_UTF8_ENCODING);
1325	}
1326	*pCp = uc;
1327	ppsz = (const char )puch;
1328	return VINF_SUCCESS;
1329	}
1330	RT_EXPORT_SYMBOL(RTStrGetCpExInternal);
1331
1332
1333	/**
1334	* Handle invalid encodings passed to RTStrGetCpNEx().
1335	* @returns rc
1336	* @param ppsz The pointer to the string position point.
1337	* @param pcch Pointer to the string length.
1338	* @param pCp Where to store RTUNICP_INVALID.
1339	* @param rc The iprt error code.
1340	*/
1341	static int rtStrGetCpNExFailure(const char *ppsz, size_t pcch, PRTUNICP pCp, int rc)
1342	{
1343	/*
1344	* Try find a valid encoding.
1345	*/
1346	(ppsz)++; /* @todo code this! */
1347	(*pcch)--;
1348	*pCp = RTUNICP_INVALID;
1349	return rc;
1350	}
1351
1352
1353	RTDECL(int) RTStrGetCpNExInternal(const char *ppsz, size_t pcch, PRTUNICP pCp)
1354	{
1355	const unsigned char puch = (const unsigned char )*ppsz;
1356	const unsigned char uch = *puch;
1357	size_t cch = *pcch;
1358	RTUNICP uc;
1359
1360	if (cch == 0)
1361	{
1362	*pCp = RTUNICP_INVALID;
1363	return VERR_END_OF_STRING;
1364	}
1365
1366	/* ASCII ? */
1367	if (!(uch & RT_BIT(7)))
1368	{
1369	uc = uch;
1370	puch++;
1371	cch--;
1372	}
1373	else if (uch & RT_BIT(6))
1374	{
1375	/* figure the length and validate the first octet. */
1376	/** @todo RT_USE_RTC_3629 */
1377	unsigned cb;
1378	if (!(uch & RT_BIT(5)))
1379	cb = 2;
1380	else if (!(uch & RT_BIT(4)))
1381	cb = 3;
1382	else if (!(uch & RT_BIT(3)))
1383	cb = 4;
1384	else if (!(uch & RT_BIT(2)))
1385	cb = 5;
1386	else if (!(uch & RT_BIT(1)))
1387	cb = 6;
1388	else
1389	{
1390	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1391	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1392	}
1393
1394	if (cb > cch)
1395	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1396
1397	/* validate the rest */
1398	switch (cb)
1399	{
1400	case 6:
1401	RTStrAssertMsgReturn((puch[5] & 0xc0) == 0x80, ("6/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1402	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1403	case 5:
1404	RTStrAssertMsgReturn((puch[4] & 0xc0) == 0x80, ("5/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1405	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1406	case 4:
1407	RTStrAssertMsgReturn((puch[3] & 0xc0) == 0x80, ("4/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1408	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1409	case 3:
1410	RTStrAssertMsgReturn((puch[2] & 0xc0) == 0x80, ("3/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1411	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1412	case 2:
1413	RTStrAssertMsgReturn((puch[1] & 0xc0) == 0x80, ("2/%u: %.Rhxs\n", cb, RT_MIN(cb + 10, strlen((char )puch)), puch),
1414	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1415	break;
1416	}
1417
1418	/* get and validate the code point. */
1419	switch (cb)
1420	{
1421	case 6:
1422	uc = (puch[5] & 0x3f)
1423	\| ((RTUNICP)(puch[4] & 0x3f) << 6)
1424	\| ((RTUNICP)(puch[3] & 0x3f) << 12)
1425	\| ((RTUNICP)(puch[2] & 0x3f) << 18)
1426	\| ((RTUNICP)(puch[1] & 0x3f) << 24)
1427	\| ((RTUNICP)(uch & 0x01) << 30);
1428	RTStrAssertMsgReturn(uc >= 0x04000000 && uc <= 0x7fffffff,
1429	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1430	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1431	break;
1432	case 5:
1433	uc = (puch[4] & 0x3f)
1434	\| ((RTUNICP)(puch[3] & 0x3f) << 6)
1435	\| ((RTUNICP)(puch[2] & 0x3f) << 12)
1436	\| ((RTUNICP)(puch[1] & 0x3f) << 18)
1437	\| ((RTUNICP)(uch & 0x03) << 24);
1438	RTStrAssertMsgReturn(uc >= 0x00200000 && uc <= 0x03ffffff,
1439	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1440	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1441	break;
1442	case 4:
1443	uc = (puch[3] & 0x3f)
1444	\| ((RTUNICP)(puch[2] & 0x3f) << 6)
1445	\| ((RTUNICP)(puch[1] & 0x3f) << 12)
1446	\| ((RTUNICP)(uch & 0x07) << 18);
1447	RTStrAssertMsgReturn(uc >= 0x00010000 && uc <= 0x001fffff,
1448	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1449	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1450	break;
1451	case 3:
1452	uc = (puch[2] & 0x3f)
1453	\| ((RTUNICP)(puch[1] & 0x3f) << 6)
1454	\| ((RTUNICP)(uch & 0x0f) << 12);
1455	RTStrAssertMsgReturn(uc >= 0x00000800 && uc <= 0x0000fffd,
1456	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1457	rtStrGetCpNExFailure(ppsz, pcch, pCp, uc == 0xffff \|\| uc == 0xfffe ? VERR_CODE_POINT_ENDIAN_INDICATOR : VERR_INVALID_UTF8_ENCODING));
1458	RTStrAssertMsgReturn(uc < 0xd800 \|\| uc > 0xdfff,
1459	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1460	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_CODE_POINT_SURROGATE));
1461	break;
1462	case 2:
1463	uc = (puch[1] & 0x3f)
1464	\| ((RTUNICP)(uch & 0x1f) << 6);
1465	RTStrAssertMsgReturn(uc >= 0x00000080 && uc <= 0x000007ff,
1466	("%u: cp=%#010RX32: %.Rhxs\n", cb, uc, RT_MIN(cb + 10, strlen((char )puch)), puch),
1467	rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING));
1468	break;
1469	default: /* impossible, but GCC is bitching. */
1470	uc = RTUNICP_INVALID;
1471	break;
1472	}
1473	puch += cb;
1474	cch -= cb;
1475	}
1476	else
1477	{
1478	/* 6th bit is always set. */
1479	RTStrAssertMsgFailed(("Invalid UTF-8 first byte: %.Rhxs\n", RT_MIN(strlen((char )puch), 10), puch));
1480	return rtStrGetCpNExFailure(ppsz, pcch, pCp, VERR_INVALID_UTF8_ENCODING);
1481	}
1482	*pCp = uc;
1483	ppsz = (const char )puch;
1484	(*pcch) = cch;
1485	return VINF_SUCCESS;
1486	}
1487	RT_EXPORT_SYMBOL(RTStrGetCpNExInternal);
1488
1489
1490	RTDECL(char ) RTStrPutCpInternal(char psz, RTUNICP uc)
1491	{
1492	unsigned char puch = (unsigned char )psz;
1493	if (uc < 0x80)
1494	*puch++ = (unsigned char )uc;
1495	else if (uc < 0x00000800)
1496	{
1497	*puch++ = 0xc0 \| (uc >> 6);
1498	*puch++ = 0x80 \| (uc & 0x3f);
1499	}
1500	else if (uc < 0x00010000)
1501	{
1502	/** @todo RT_USE_RTC_3629 */
1503	if ( uc < 0x0000d8000
1504	\|\| ( uc > 0x0000dfff
1505	&& uc < 0x0000fffe))
1506	{
1507	*puch++ = 0xe0 \| (uc >> 12);
1508	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1509	*puch++ = 0x80 \| (uc & 0x3f);
1510	}
1511	else
1512	{
1513	AssertMsgFailed(("Invalid code point U+%05x!\n", uc));
1514	*puch++ = 0x7f;
1515	}
1516	}
1517	/** @todo RT_USE_RTC_3629 */
1518	else if (uc < 0x00200000)
1519	{
1520	*puch++ = 0xf0 \| (uc >> 18);
1521	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1522	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1523	*puch++ = 0x80 \| (uc & 0x3f);
1524	}
1525	else if (uc < 0x04000000)
1526	{
1527	*puch++ = 0xf8 \| (uc >> 24);
1528	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1529	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1530	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1531	*puch++ = 0x80 \| (uc & 0x3f);
1532	}
1533	else if (uc <= 0x7fffffff)
1534	{
1535	*puch++ = 0xfc \| (uc >> 30);
1536	*puch++ = 0x80 \| ((uc >> 24) & 0x3f);
1537	*puch++ = 0x80 \| ((uc >> 18) & 0x3f);
1538	*puch++ = 0x80 \| ((uc >> 12) & 0x3f);
1539	*puch++ = 0x80 \| ((uc >> 6) & 0x3f);
1540	*puch++ = 0x80 \| (uc & 0x3f);
1541	}
1542	else
1543	{
1544	AssertMsgFailed(("Invalid code point U+%08x!\n", uc));
1545	*puch++ = 0x7f;
1546	}
1547
1548	return (char *)puch;
1549	}
1550	RT_EXPORT_SYMBOL(RTStrPutCpInternal);
1551
1552
1553	RTDECL(char ) RTStrPrevCp(const char pszStart, const char *psz)
1554	{
1555	if (pszStart < psz)
1556	{
1557	/* simple char? */
1558	const unsigned char puch = (const unsigned char )psz;
1559	unsigned uch = *--puch;
1560	if (!(uch & RT_BIT(7)))
1561	return (char *)puch;
1562	RTStrAssertMsgReturn(!(uch & RT_BIT(6)), ("uch=%#x\n", uch), (char *)pszStart);
1563
1564	/* two or more. */
1565	uint32_t uMask = 0xffffffc0;
1566	while ( (const unsigned char *)pszStart < puch
1567	&& !(uMask & 1))
1568	{
1569	uch = *--puch;
1570	if ((uch & 0xc0) != 0x80)
1571	{
1572	RTStrAssertMsgReturn((uch & (uMask >> 1)) == (uMask & 0xff),
1573	("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz),
1574	(char *)pszStart);
1575	return (char *)puch;
1576	}
1577	uMask >>= 1;
1578	}
1579	RTStrAssertMsgFailed(("Invalid UTF-8 encoding: %.Rhxs puch=%p psz=%p\n", psz - (char )puch, puch, psz));
1580	}
1581	return (char *)pszStart;
1582	}
1583	RT_EXPORT_SYMBOL(RTStrPrevCp);
1584

Note: See TracBrowser for help on using the repository browser.

source: vbox/trunk/src/VBox/Runtime/common/string/utf-8.cpp@ 31418

Download in other formats: