Open Chinese Convert  1.0.3
A project for conversion between Traditional and Simplified Chinese
UTF8Util.hpp
00001 /*
00002  * Open Chinese Convert
00003  *
00004  * Copyright 2013 BYVoid <byvoid@byvoid.com>
00005  *
00006  * Licensed under the Apache License, Version 2.0 (the "License");
00007  * you may not use this file except in compliance with the License.
00008  * You may obtain a copy of the License at
00009  *
00010  *      http://www.apache.org/licenses/LICENSE-2.0
00011  *
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  */
00018 
00019 #pragma once
00020 
00021 #include "Common.hpp"
00022 
00023 namespace opencc {
00028 class OPENCC_EXPORT UTF8Util {
00029 public:
00033   static void SkipUtf8Bom(FILE* fp);
00034 
/**
 * Returns the byte length of the UTF-8 sequence announced by the first byte
 * of `str`, or 0 when that byte cannot start a sequence (a continuation
 * byte 10xxxxxx, or 0xFE / 0xFF).
 *
 * Only the lead byte is inspected; the following bytes are not validated.
 * The legacy 5- and 6-byte forms are recognised. A NUL byte counts as a
 * one-byte character.
 */
static size_t NextCharLengthNoException(const char* str) {
  const unsigned char lead = static_cast<unsigned char>(*str);
  // Three-byte leads (1110xxxx) are tested first, as in the original
  // ordering -- presumably because CJK text is dominated by them.
  if (lead >= 0xE0 && lead < 0xF0) {
    return 3;
  }
  if (lead < 0x80) {
    return 1;  // 0xxxxxxx
  }
  if (lead < 0xC0) {
    return 0;  // 10xxxxxx: continuation byte, not a lead
  }
  if (lead < 0xE0) {
    return 2;  // 110xxxxx
  }
  if (lead < 0xF8) {
    return 4;  // 11110xxx (0xE0..0xEF already handled above)
  }
  if (lead < 0xFC) {
    return 5;  // 111110xx
  }
  if (lead < 0xFE) {
    return 6;  // 1111110x
  }
  return 0;  // 0xFE / 0xFF never appear in UTF-8
}
00056 
00060   static size_t NextCharLength(const char* str) {
00061     size_t length = NextCharLengthNoException(str);
00062     if (length == 0) {
00063       throw InvalidUTF8(str);
00064     }
00065     return length;
00066   }
00067 
00071   static size_t PrevCharLength(const char* str) {
00072     {
00073       const size_t length = NextCharLengthNoException(str - 3);
00074       if (length == 3) {
00075         return length;
00076       }
00077     }
00078     {
00079       const size_t length = NextCharLengthNoException(str - 1);
00080       if (length == 1) {
00081         return length;
00082       }
00083     }
00084     {
00085       const size_t length = NextCharLengthNoException(str - 2);
00086       if (length == 2) {
00087         return length;
00088       }
00089     }
00090     for (size_t i = 4; i <= 6; i++) {
00091       const size_t length = NextCharLengthNoException(str - i);
00092       if (length == i) {
00093         return length;
00094       }
00095     }
00096     throw InvalidUTF8(str);
00097   }
00098 
00102   static const char* NextChar(const char* str) {
00103     return str + NextCharLength(str);
00104   }
00105 
00109   static const char* PrevChar(const char* str) {
00110     return str - PrevCharLength(str);
00111   }
00112 
00116   static size_t Length(const char* str) {
00117     size_t length = 0;
00118     while (*str != '\0') {
00119       str = NextChar(str);
00120       length++;
00121     }
00122     return length;
00123   }
00124 
00131   static const char* FindNextInline(const char* str, const char ch) {
00132     while (!IsLineEndingOrFileEnding(*str) && *str != ch) {
00133       str = NextChar(str);
00134     }
00135     return str;
00136   }
00137 
/**
 * Returns true when `ch` terminates a line or the string: NUL, line feed
 * or carriage return.
 */
static bool IsLineEndingOrFileEnding(const char ch) {
  switch (ch) {
    case '\0':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
00144 
/**
 * Builds a string of exactly `length` bytes from the beginning of `str`.
 *
 * Copying follows strncpy semantics: it stops at the first NUL found in
 * `str`, and any remaining bytes of the result are NUL.
 *
 * @param str    source bytes; must be readable up to `length` bytes or up
 *               to its terminating NUL, whichever comes first.
 * @param length size in bytes of the returned string.
 */
static string FromSubstr(const char* str, size_t length) {
  string newStr(length, '\0');
  // Write through the mutable element reference: modifying the buffer
  // returned by c_str() (as the previous const_cast did) is undefined
  // behaviour.
  strncpy(&newStr[0], str, length);
  return newStr;
}
00154 
/**
 * Returns true when the NUL-terminated string `str` holds at least
 * `byteLength` bytes before its terminator. Reads no further than
 * `byteLength` bytes or the first NUL.
 */
static bool NotShorterThan(const char* str, size_t byteLength) {
  for (size_t offset = 0; offset < byteLength; ++offset) {
    if (str[offset] == '\0') {
      return false;
    }
  }
  return true;
}
00168 
00173   static string TruncateUTF8(const char* str, size_t maxByteLength) {
00174     string wordTrunc;
00175     if (NotShorterThan(str, maxByteLength)) {
00176       size_t len = 0;
00177       const char* pStr = str;
00178       for (;;) {
00179         const size_t charLength = NextCharLength(pStr);
00180         if (len + charLength > maxByteLength) {
00181           break;
00182         }
00183         pStr += charLength;
00184         len += charLength;
00185       }
00186       wordTrunc = FromSubstr(str, len);
00187     } else {
00188       wordTrunc = str;
00189     }
00190     return wordTrunc;
00191   }
00192 
/**
 * Replaces, in place, every non-overlapping occurrence of `from` in `str`
 * with `to`, scanning left to right. Text inserted by a replacement is not
 * rescanned.
 *
 * An empty `from` leaves `str` unchanged. Without this guard the empty
 * pattern matches at every position, and the loop would never terminate.
 *
 * @param str  string to modify.
 * @param from NUL-terminated pattern to search for.
 * @param to   NUL-terminated replacement text.
 */
static void ReplaceAll(string& str, const char* from, const char* to) {
  const string::size_type fromLen = strlen(from);
  if (fromLen == 0) {
    return;
  }
  const string::size_type toLen = strlen(to);
  string::size_type pos = 0;
  while ((pos = str.find(from, pos)) != string::npos) {
    str.replace(pos, fromLen, to);
    pos += toLen;  // resume after the inserted text
  }
}
00205 
/**
 * Concatenates the elements of `strings` in order, inserting `separator`
 * between consecutive elements. An empty vector yields an empty string.
 */
static string Join(const vector<string>& strings, const string& separator) {
  std::ostringstream joined;
  for (size_t index = 0; index < strings.size(); ++index) {
    if (index != 0) {
      joined << separator;
    }
    joined << strings[index];
  }
  return joined.str();
}
00221 
/**
 * Concatenates the elements of `strings` in order, with nothing between
 * them. An empty vector yields an empty string.
 */
static string Join(const vector<string>& strings) {
  std::ostringstream joined;
  for (vector<string>::const_iterator it = strings.begin();
       it != strings.end(); ++it) {
    joined << *it;
  }
  return joined.str();
}
00232 
00233   static void GetByteMap(const char* str, const size_t utf8Length,
00234                          vector<size_t>* byteMap) {
00235     if (byteMap->size() < utf8Length) {
00236       byteMap->resize(utf8Length);
00237     }
00238     const char* pstr = str;
00239     for (size_t i = 0; i < utf8Length; i++) {
00240       (*byteMap)[i] = pstr - str;
00241       pstr = NextChar(pstr);
00242     }
00243   }
00244 };
00245 }
 All Classes Functions