Index: /trunk/include/iprt/cpp/ministring.h
===================================================================
--- /trunk/include/iprt/cpp/ministring.h	(revision 35566)
+++ /trunk/include/iprt/cpp/ministring.h	(revision 35567)
@@ -185,7 +185,7 @@
      * String length in bytes.
      *
-     * Returns the length of the member string, which is equal to strlen(c_str()).
-     * In other words, this does not count unicode codepoints but returns the number
-     * of bytes.  This is always cached so calling this is cheap and requires no
+     * Returns the length of the member string in bytes, which is equal to strlen(c_str()).
+     * In other words, this does not count unicode codepoints; use utf8length() for that.
+     * The byte length is always cached so calling this is cheap and requires no
      * strlen() invocation.
      *
@@ -195,4 +195,17 @@
     {
         return m_cch;
+    }
+
+    /**
+     * String length in Unicode codepoints.
+     *
+     * Unlike length(), which returns the cached byte count, this calls RTStrUniLen()
+     * on the member string each time. The result is *not* cached, so it is expensive.
+     *
+     * @returns Number of codepoints in the member string; 0 if m_psz is NULL.
+     */
+    size_t utf8length() const
+    {
+        return !m_psz ? 0 : RTStrUniLen(m_psz);
     }
 
@@ -652,6 +665,6 @@
      * Find the given substring.
      *
-     * Looks for pcszFind in "this" starting at "pos" and returns its position,
-     * counting from the beginning of "this" at 0.
+     * Looks for pcszFind in "this" starting at "pos" and returns its position
+     * as a byte (not codepoint) offset, counting from the beginning of "this" at 0.
      *
      * @param   pcszFind        The substring to find.
@@ -676,7 +689,23 @@
      * Returns a substring of "this" as a new Utf8Str.
      *
-     * Works exactly like its equivalent in std::string except that this interprets
-     * pos and n as unicode codepoints instead of bytes.  With the default
-     * parameters "0" and "npos", this always copies the entire string.
+     * Works like its equivalent in std::string, except that a "pos" beyond the
+     * end yields an empty string (std::string would throw). With the defaults
+     * "0" and "npos" the entire string is copied. "pos" and "n" count bytes; it
+     * is the caller's responsibility not to split a multi-byte UTF-8 sequence.
+     * Byte offsets obtained from find() and length() are safe in that respect.
+     *
+     * @param   pos             Byte offset of the first byte to copy from "this", counting from 0.
+     * @param   n               Number of bytes to copy, starting with the one at "pos".
+     *                          The copying will stop if the null terminator is encountered before
+     *                          n bytes have been copied.
+     */
+    iprt::MiniString substr(size_t pos = 0, size_t n = npos) const
+    {
+        return iprt::MiniString(*this, pos, n); /* delegate to the (source, offset, count) constructor */
+    }
+
+    /**
+     * Returns a substring of "this" as a new MiniString. Unlike substr(), this
+     * variant interprets "pos" and "n" as codepoint counts instead of byte counts.
      *
      * @param   pos             Index of first unicode codepoint to copy from
@@ -686,8 +715,6 @@
      *                          terminator is encountered before n codepoints have
      *                          been copied.
-     *
-     * @remarks This works on code points, not bytes!
-     */
-    iprt::MiniString substr(size_t pos = 0, size_t n = npos) const;
+     */
+    iprt::MiniString substrCP(size_t pos = 0, size_t n = npos) const;
 
     /**
Index: /trunk/src/VBox/Runtime/common/string/ministring.cpp
===================================================================
--- /trunk/src/VBox/Runtime/common/string/ministring.cpp	(revision 35566)
+++ /trunk/src/VBox/Runtime/common/string/ministring.cpp	(revision 35567)
@@ -224,5 +224,5 @@
 }
 
-MiniString MiniString::substr(size_t pos /*= 0*/, size_t n /*= npos*/)
+MiniString MiniString::substrCP(size_t pos /*= 0*/, size_t n /*= npos*/)
     const
 {
@@ -256,11 +256,14 @@
 
                 size_t cbCopy = psz - pFirst;
-                ret.reserve(cbCopy + 1); // may throw bad_alloc
-#ifndef RT_EXCEPTIONS_ENABLED
-                AssertRelease(capacity() >= cbCopy + 1);
-#endif
-                memcpy(ret.m_psz, pFirst, cbCopy);
-                ret.m_cch = cbCopy;
-                ret.m_psz[cbCopy] = '\0';
+                if (cbCopy)     // nothing selected: leave ret as it is
+                {
+                    ret.reserve(cbCopy + 1); // may throw bad_alloc
+#ifndef RT_EXCEPTIONS_ENABLED
+                    AssertRelease(ret.capacity() >= cbCopy + 1); // verify the buffer we are about to write
+#endif
+                    memcpy(ret.m_psz, pFirst, cbCopy);
+                    ret.m_cch = cbCopy;             // length in bytes, terminator excluded
+                    ret.m_psz[cbCopy] = '\0';
+                }
             }
         }
Index: /trunk/src/VBox/Runtime/testcase/tstIprtMiniString.cpp
===================================================================
--- /trunk/src/VBox/Runtime/testcase/tstIprtMiniString.cpp	(revision 35566)
+++ /trunk/src/VBox/Runtime/testcase/tstIprtMiniString.cpp	(revision 35567)
@@ -211,4 +211,44 @@
     CHECK_EQUAL(SubStr15, "cdef");
 
+    /* substr() takes byte offsets/counts, substrCP() takes codepoint offsets/counts */
+    iprt::MiniString strTest("");
+    CHECK_EQUAL(strTest.substr(0), "");
+    CHECK_EQUAL(strTest.substrCP(0), "");
+    CHECK_EQUAL(strTest.substr(1), "");                 // offset past the end of an empty string
+    CHECK_EQUAL(strTest.substrCP(1), "");
+
+    /* non-ASCII input: byte and codepoint offsets differ after the first two-byte character */
+    strTest = "abcdefßäbcdef";
+            // 13 codepoints, but 15 bytes (excluding null terminator);
+            // "ß" occupies bytes 6-7 and "ä" bytes 8-9 (two bytes each)
+    CHECK_EQUAL(strTest.substr(0),   strTest.c_str());
+    CHECK_EQUAL(strTest.substrCP(0), strTest.c_str());
+
+    CHECK_EQUAL(strTest.substr(2),   "cdefßäbcdef");
+    CHECK_EQUAL(strTest.substrCP(2), "cdefßäbcdef");
+
+    CHECK_EQUAL(strTest.substr(2, 2),   "cd");
+    CHECK_EQUAL(strTest.substrCP(2, 2), "cd");
+
+    CHECK_EQUAL(strTest.substr(6),   "ßäbcdef");
+    CHECK_EQUAL(strTest.substrCP(6), "ßäbcdef");
+
+    CHECK_EQUAL(strTest.substr(6, 2),   "ß");           // two bytes, but a single codepoint
+    CHECK_EQUAL(strTest.substrCP(6, 1), "ß");
+
+    CHECK_EQUAL(strTest.substr(8),   "äbcdef");         // "ä" starts at byte 8 but is codepoint 7
+    CHECK_EQUAL(strTest.substrCP(7), "äbcdef");
+
+    CHECK_EQUAL(strTest.substr(8, 3),   "äb");          // three bytes: two for "ä", one for "b"
+    CHECK_EQUAL(strTest.substrCP(7, 2), "äb");
+
+    CHECK_EQUAL(strTest.substr(14, 1),   "f");          // last byte / last codepoint
+    CHECK_EQUAL(strTest.substrCP(12, 1), "f");
+
+    CHECK_EQUAL(strTest.substr(15, 1),   "");           // offset equals the length
+    CHECK_EQUAL(strTest.substrCP(13, 1), "");
+
+    CHECK_EQUAL(strTest.substr(16, 1),   "");           // offset beyond the end
+    CHECK_EQUAL(strTest.substrCP(15, 1), "");
 
     /* special constructor and assignment arguments */
