Index: /trunk/include/iprt/string.h
===================================================================
--- /trunk/include/iprt/string.h	(revision 31245)
+++ /trunk/include/iprt/string.h	(revision 31246)
@@ -29,5 +29,7 @@
 #include <iprt/cdefs.h>
 #include <iprt/types.h>
+#include <iprt/assert.h>
 #include <iprt/stdarg.h>
+#include <iprt/uni.h> /* for RTUNICP_INVALID */
 #include <iprt/err.h> /* for VINF_SUCCESS */
 #if defined(RT_OS_LINUX) && defined(__KERNEL__)
@@ -1157,4 +1159,24 @@
 
 /**
+ * Get the UTF-8 size, in chars (bytes), of a given Unicode code point.  The
+ * code point need not be within the range that UTF-8 is able to encode.
+ *
+ * @returns The encoded size (1 to 4), or zero if there is no UTF-8 encoding.
+ * @param   CodePoint   The Unicode code point to measure.
+ */
+DECLINLINE(size_t) RTStrCpSize(RTUNICP CodePoint)
+{
+    if (CodePoint < 0x80)
+        return 1;
+    if (CodePoint < 0x800)
+        return 2;
+    if (CodePoint < 0x10000)
+        return 3;
+    if (CodePoint < 0x110000)   /* Unicode ends at U+10FFFF, not U+10FFF. */
+        return 4;
+    return 0;
+}
+
+/**
  * Put the unicode code point at the given string position
  * and return the pointer to the char following it.
@@ -1207,4 +1229,128 @@
  */
 RTDECL(char *) RTStrPrevCp(const char *pszStart, const char *psz);
+
+/**
+ * Get the unicode code point at the given Latin-1 string position.
+ *
+ * @returns The code point, i.e. the value (0..0xff) of the char at psz read
+ *          as an unsigned char; the function has no failure path.
+ * @param   psz         The string.
+ */
+DECLINLINE(RTUNICP) RTLatin1GetCp(const char *psz)
+{
+    return (RTUNICP)*(const unsigned char *)psz;
+}
+
+/**
+ * Get the unicode code point at the given Latin-1 string position and step
+ * the string pointer past it.
+ *
+ * @returns VINF_SUCCESS; there is no failure path as any char value is
+ *          passed through as the code point.
+ * @param   ppsz        Pointer to the string pointer. This will be updated to
+ *                      point to the char following the current code point.
+ *                      It is advanced unconditionally, so a terminator at
+ *                      the current position is stepped over as well.
+ * @param   pCp         Where to store the code point.
+ *
+ * @remark  One code point always occupies exactly one char here.
+ */
+DECLINLINE(int) RTLatin1GetCpEx(const char **ppsz, PRTUNICP pCp)
+{
+    const unsigned char *puchCur = (const unsigned char *)*ppsz;
+    *ppsz = (const char *)(puchCur + 1);
+    *pCp  = *puchCur;
+    return VINF_SUCCESS;
+}
+
+/**
+ * Get the unicode code point at the given Latin-1 string position for a
+ * string of a given maximum length.
+ *
+ * @returns VINF_SUCCESS when a char was read (a zero char included).
+ * @retval  VERR_END_OF_STRING if *pcch is 0. *pCp is set to RTUNICP_INVALID
+ *          while *ppsz and *pcch are left untouched.
+ *
+ * @param   ppsz        Pointer to the string pointer. This will be updated to
+ *                      point to the char following the current code point.
+ * @param   pcch        Pointer to the maximum string length.  This will be
+ *                      decremented by one for each code point read.
+ * @param   pCp         Where to store the code point.
+ */
+DECLINLINE(int) RTLatin1GetCpNEx(const char **ppsz, size_t *pcch, PRTUNICP pCp)
+{
+    const size_t cchLeft = *pcch;
+    if (RT_UNLIKELY(cchLeft == 0))
+    {
+        *pCp = RTUNICP_INVALID;
+        return VERR_END_OF_STRING;
+    }
+    *pCp  = *(const unsigned char *)*ppsz;
+    *ppsz += 1;
+    *pcch  = cchLeft - 1;
+    return VINF_SUCCESS;
+}
+
+/**
+ * Get the Latin-1 size in characters of a given Unicode code point.  The code
+ * point need not be within the range that Latin-1 is able to encode.
+ *
+ * @returns 1 if the code point is below 0x100, otherwise zero (no encoding).
+ * @param   CodePoint   The Unicode code point to measure.
+ */
+DECLINLINE(size_t) RTLatin1CpSize(RTUNICP CodePoint)
+{
+    /* Latin-1 covers the code points 0x00 thru 0xff, one char each. */
+    const size_t cchCp = CodePoint <= 0xff ? 1 : 0;
+    return cchCp;
+}
+
+/**
+ * Put the unicode code point at the given string position
+ * and return the pointer to the char following it.
+ *
+ * Only the char at psz is written; whatever follows it in the buffer is
+ * neither read nor moved. The function is therefore not suitable for
+ * inserting code points into a string, only appending/overwriting.
+ *
+ * @returns pointer to the char following the written code point.
+ * @returns NULL (the AssertReturn value) if CodePoint is 0x100 or above.
+ * @param   psz         The string.
+ * @param   CodePoint   The code point to write.  This must not be
+ *                      RTUNICP_INVALID or any other value outside Latin-1.
+ */
+DECLINLINE(char *) RTLatin1PutCp(char *psz, RTUNICP CodePoint)
+{
+    AssertReturn(CodePoint < 0x100, NULL);
+    *psz = (unsigned char)CodePoint;
+    return psz + 1;
+}
+
+/**
+ * Skips ahead, past the current code point.
+ *
+ * @returns Pointer to the char after the current code point.
+ * @param   psz     Pointer to the current code point.
+ * @remark  This steps exactly one char; it does not check for a terminator.
+ */
+DECLINLINE(char *) RTLatin1NextCp(const char *psz)
+{
+    /* A Latin-1 code point is always a single char wide. */
+    return (char *)(psz + 1);
+}
+
+/**
+ * Skips back to the previous code point.
+ *
+ * @returns Pointer to the char before the current code point.
+ * @param   psz         Pointer to the current code point.
+ * @remark  There is no start-of-string parameter and thus no bounds check;
+ *          the caller must make sure psz is not at the start of the string.
+ */
+DECLINLINE(char *) RTLatin1PrevCp(const char *psz)
+{
+    /* A Latin-1 code point is always a single char wide. */
+    return (char *)(psz - 1);
+}
 
 
Index: /trunk/src/VBox/Runtime/common/string/utf-8.cpp
===================================================================
--- /trunk/src/VBox/Runtime/common/string/utf-8.cpp	(revision 31245)
+++ /trunk/src/VBox/Runtime/common/string/utf-8.cpp	(revision 31246)
@@ -805,15 +805,18 @@
 {
     size_t  cch = 0;
-    while (cchIn > 0)
-    {
-        char ch = *psz++; cchIn--;
-        if (!ch)
+    while (true)
+    {
+        RTUNICP Cp;
+        size_t cchCp;
+        int rc = RTLatin1GetCpNEx(&psz, &cchIn, &Cp);
+        if (Cp == 0 || rc == VERR_END_OF_STRING)
             break;
-        if (!(ch & 0x80))
-            cch++;
-        else
-            cch += 2;
-    }
-
+        if (RT_FAILURE(rc))
+            return rc;
+        cchCp = RTStrCpSize(Cp);
+        if (cchCp == 0)
+            return VERR_NO_TRANSLATION;
+        cch += cchCp;
+    }
 
     /* done */
@@ -832,43 +835,31 @@
  * @param   psz         Where to store the UTF-8 string.
  * @param   cch         The size of the UTF-8 buffer, excluding the terminator.
- * @param   pcch        Where to store the number of octets actually encoded.
  */
-static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch, size_t *pcch)
-{
-    unsigned char  *puch = (unsigned char *)psz;
-    int             rc = VINF_SUCCESS;
-    while (cchIn > 0)
-    {
-        unsigned char ch = (unsigned char) *pszIn++; cchIn--;
-        if (!ch)
+static int rtLatin1RecodeAsUtf8(const char *pszIn, size_t cchIn, char *psz, size_t cch)
+{
+    int   rc  = VINF_SUCCESS;
+
+    while (true) /* left via break only */
+    {
+        RTUNICP Cp;
+        size_t cchCp;
+        rc = RTLatin1GetCpNEx(&pszIn, &cchIn, &Cp); /* VERR_END_OF_STRING once cchIn is 0 */
+        if (Cp == 0 || RT_FAILURE(rc)) /* terminator reached or input used up */
             break;
-        if (!(ch & 0x80))
-        {
-            if (RT_UNLIKELY(cch < 1))
-            {
-                RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
-                rc = VERR_BUFFER_OVERFLOW;
-                break;
-            }
-            cch--;
-            *puch++ = (unsigned char)ch;
-        }
-        else
-        {
-            if (RT_UNLIKELY(cch < 2))
-            {
-                RTStrAssertMsgFailed(("Buffer overflow! 2\n"));
-                rc = VERR_BUFFER_OVERFLOW;
-                break;
-            }
-            cch -= 2;
-            *puch++ = 0xc0 | (ch >> 6);
-            *puch++ = 0x80 | (ch & 0x3f);
-        }
+        cchCp = RTStrCpSize(Cp); /* UTF-8 size of Cp in chars */
+        if (RT_UNLIKELY(cch < cchCp)) /* not enough output space left for Cp */
+        {
+            RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
+            rc = VERR_BUFFER_OVERFLOW;
+            break;
+        }
+        psz = RTStrPutCp(psz, Cp); /* continue at the char following the encoded Cp */
+        cch -= cchCp;
     }
 
     /* done */
-    *puch = '\0';
-    *pcch = (char *)puch - psz;
+    if (rc == VERR_END_OF_STRING) /* using up the input is a normal way to finish */
+        rc = VINF_SUCCESS;
+    *psz = '\0'; /* terminate the output on every exit path */
     return rc;
 }
@@ -898,5 +889,5 @@
         if (pszResult)
         {
-            rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch, &cch);
+            rc = rtLatin1RecodeAsUtf8(pszString, RTSTR_MAX, pszResult, cch);
             if (RT_SUCCESS(rc))
             {
@@ -955,5 +946,5 @@
         if (pszResult)
         {
-            rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1, &cch);
+            rc = rtLatin1RecodeAsUtf8(pszString, cchString, pszResult, cch - 1);
             if (RT_SUCCESS(rc))
             {
@@ -998,26 +989,28 @@
  * @returns IPRT status code.
  * @param   psz     Pointer to the UTF-8 string.
- * @param   cch     The max length of the string. (btw cch = cb)
+ * @param   cchIn   The max length of the string. (btw cch = cb)
  *                  Use RTSTR_MAX if all of the string is to be examined.
  * @param   pcch    Where to store the length of the Latin-1 string in bytes.
  */
-static int rtUtf8CalcLatin1Length(const char *psz, size_t cch, size_t *pcch)
-{
-    size_t cchOut = 0;
+static int rtUtf8CalcLatin1Length(const char *psz, size_t cchIn, size_t *pcch)
+{
+    size_t  cch = 0; /* Latin-1 length accumulated so far */
     while (true)
     {
         RTUNICP Cp;
-        int rc = RTStrGetCpNEx(&psz, &cch, &Cp);
+        size_t cchCp;
+        int rc = RTStrGetCpNEx(&psz, &cchIn, &Cp); /* presumably advances psz and shrinks cchIn - TODO confirm */
         if (Cp == 0 || rc == VERR_END_OF_STRING)
             break;
         if (RT_FAILURE(rc))
             return rc;
-        if (Cp >= 0x100)
+        cchCp = RTLatin1CpSize(Cp); /* 1, or 0 when Cp is 0x100 or above */
+        if (cchCp == 0) /* Cp has no Latin-1 encoding */
             return VERR_NO_TRANSLATION;
-        cchOut++;
+        cch += cchCp;
     }
 
     /* done */
-    *pcch = cchOut;
+    *pcch = cch; /* reached only when the loop ended via break, never on the error returns */
     return VINF_SUCCESS;
 }
@@ -1030,61 +1023,45 @@
  *
  * @returns iprt status code.
- * @param   psz     The UTF-8 string to recode. This is a valid encoding.
- * @param   cch     The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
+ * @param   pszIn   The UTF-8 string to recode. This is a valid encoding.
+ * @param   cchIn   The number of chars (the type char, so bytes if you like) to process of the UTF-8 string.
  *                  The recoding will stop when cch or '\\0' is reached. Pass RTSTR_MAX to process up to '\\0'.
- * @param   pszOut  Where to store the Latin-1 string.
- * @param   cchOut  The number of characters the pszOut buffer can hold, excluding the terminator ('\\0').
+ * @param   psz     Where to store the Latin-1 string.
+ * @param   cch     The number of characters the psz buffer can hold, excluding the terminator ('\\0').
  */
-static int rtUtf8RecodeAsLatin1(const char *psz, size_t cch, char *pszOut, size_t cchOut)
-{
-    int                     rc      = VINF_SUCCESS;
-    const unsigned char    *puch    = (const unsigned char *)psz;
-    unsigned char          *puchOut = (unsigned char *)pszOut;
-    while (cch > 0)
-    {
-        /* read the next char and check for terminator. */
-        const unsigned char uch = *puch;
-        if (!uch)
+static int rtUtf8RecodeAsLatin1(const char *pszIn, size_t cchIn, char *psz, size_t cch)
+{
+    int   rc  = VINF_SUCCESS;
+
+    while (true)
+    {
+        RTUNICP Cp;
+        size_t cchCp;
+        rc = RTStrGetCpNEx(&pszIn, &cchIn, &Cp);
+        if (Cp == 0 || RT_FAILURE(rc))
             break;
-
-        /* check for output overflow */
-        if (RT_UNLIKELY(cchOut < 1))
-        {
+        cchCp = RTLatin1CpSize(Cp);
+        if (RT_UNLIKELY(cchCp == 0))
+        {
+            /* No Latin-1 encoding: RTLatin1PutCp would assert and hand back NULL. */
+            rc = VERR_NO_TRANSLATION;
+            break;
+        }
+        if (RT_UNLIKELY(cch < cchCp))
+        {
+            RTStrAssertMsgFailed(("Buffer overflow! 1\n"));
             rc = VERR_BUFFER_OVERFLOW;
             break;
         }
-        cchOut--;
-
-        /* decode and recode the code point */
-        if (!(uch & RT_BIT(7)))
-        {
-            *puchOut++ = uch;
-            puch++;
-            cch--;
-        }
-        else if ((uch & (RT_BIT(7) | RT_BIT(6) | RT_BIT(5))) == (RT_BIT(7) | RT_BIT(6)))
-        {
-            uint16_t uc = (puch[1] & 0x3f)
-                    | ((uint16_t)(uch     & 0x1f) << 6);
-            if (uc >= 0x100)
-            {
-                rc = VERR_NO_TRANSLATION;
-                break;
-            }
-            *puchOut++ = uc;
-            puch += 2;
-            cch -= 2;
-        }
-        else
-        {
-            rc = VERR_NO_TRANSLATION;
-            break;
-        }
+        psz = RTLatin1PutCp(psz, Cp);
+        cch -= cchCp;
     }
 
     /* done */
-    *puchOut = '\0';
-    return rc;
-}
+    if (rc == VERR_END_OF_STRING)
+        rc = VINF_SUCCESS;
+    *psz = '\0';
+    return rc;
+}
+
 
 
