Open Chinese Convert
1.0.3
A project for conversion between Traditional and Simplified Chinese
|
00001 /* 00002 * Open Chinese Convert 00003 * 00004 * Copyright 2013 BYVoid <byvoid@byvoid.com> 00005 * 00006 * Licensed under the Apache License, Version 2.0 (the "License"); 00007 * you may not use this file except in compliance with the License. 00008 * You may obtain a copy of the License at 00009 * 00010 * http://www.apache.org/licenses/LICENSE-2.0 00011 * 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an "AS IS" BASIS, 00014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 */ 00018 00019 #pragma once 00020 00021 #include "Common.hpp" 00022 00023 namespace opencc { 00028 class OPENCC_EXPORT UTF8Util { 00029 public: 00033 static void SkipUtf8Bom(FILE* fp); 00034 00039 static size_t NextCharLengthNoException(const char* str) { 00040 char ch = *str; 00041 if ((ch & 0xF0) == 0xE0) { 00042 return 3; 00043 } else if ((ch & 0x80) == 0x00) { 00044 return 1; 00045 } else if ((ch & 0xE0) == 0xC0) { 00046 return 2; 00047 } else if ((ch & 0xF8) == 0xF0) { 00048 return 4; 00049 } else if ((ch & 0xFC) == 0xF8) { 00050 return 5; 00051 } else if ((ch & 0xFE) == 0xFC) { 00052 return 6; 00053 } 00054 return 0; 00055 } 00056 00060 static size_t NextCharLength(const char* str) { 00061 size_t length = NextCharLengthNoException(str); 00062 if (length == 0) { 00063 throw InvalidUTF8(str); 00064 } 00065 return length; 00066 } 00067 00071 static size_t PrevCharLength(const char* str) { 00072 { 00073 const size_t length = NextCharLengthNoException(str - 3); 00074 if (length == 3) { 00075 return length; 00076 } 00077 } 00078 { 00079 const size_t length = NextCharLengthNoException(str - 1); 00080 if (length == 1) { 00081 return length; 00082 } 00083 } 00084 { 00085 const size_t length = NextCharLengthNoException(str - 2); 00086 if (length == 2) { 00087 return length; 00088 } 00089 } 00090 for (size_t i = 4; i <= 6; i++) { 00091 const size_t length = NextCharLengthNoException(str - i); 00092 if (length == i) { 00093 return length; 00094 } 00095 } 00096 throw InvalidUTF8(str); 00097 } 00098 00102 static const char* NextChar(const char* str) { 00103 return str + NextCharLength(str); 00104 } 00105 00109 static const char* PrevChar(const char* str) { 00110 return str - PrevCharLength(str); 00111 } 00112 00116 static size_t Length(const char* str) { 00117 size_t length = 0; 00118 while (*str != '\0') { 00119 str = NextChar(str); 00120 length++; 00121 } 00122 return length; 00123 } 00124 00131 static const char* FindNextInline(const char* str, const char ch) { 00132 while (!IsLineEndingOrFileEnding(*str) && *str != ch) { 00133 str = NextChar(str); 00134 } 00135 return str; 00136 } 00137 00141 static bool IsLineEndingOrFileEnding(const char ch) { 00142 return ch == '\0' || ch == '\n' || ch == '\r'; 00143 } 00144 00148 static string FromSubstr(const char* str, size_t length) { 00149 string newStr; 00150 newStr.resize(length); 00151 strncpy(const_cast<char*>(newStr.c_str()), str, length); 00152 return newStr; 00153 } 00154 00158 static bool NotShorterThan(const char* str, size_t byteLength) { 00159 while (byteLength > 0) { 00160 if (*str == '\0') { 00161 return false; 00162 } 00163 byteLength--; 00164 str++; 00165 } 00166 return true; 00167 } 00168 00173 static string TruncateUTF8(const char* str, size_t maxByteLength) { 00174 string wordTrunc; 00175 if (NotShorterThan(str, maxByteLength)) { 00176 size_t len = 0; 00177 const char* pStr = str; 00178 for (;;) { 00179 const size_t charLength = NextCharLength(pStr); 00180 if (len + charLength > maxByteLength) { 00181 break; 00182 } 00183 pStr += charLength; 00184 len += charLength; 00185 } 00186 wordTrunc = FromSubstr(str, len); 00187 } else { 00188 wordTrunc = str; 00189 } 00190 return wordTrunc; 00191 } 00192 00196 static void ReplaceAll(string& str, const char* from, const char* to) { 00197 string::size_type pos = 0; 00198 string::size_type fromLen = strlen(from); 00199 string::size_type toLen = strlen(to); 00200 while ((pos = str.find(from, pos)) != string::npos) { 00201 str.replace(pos, fromLen, to); 00202 pos += toLen; 00203 } 00204 } 00205 00209 static string Join(const vector<string>& strings, const string& separator) { 00210 std::ostringstream buffer; 00211 bool first = true; 00212 for (const auto& str : strings) { 00213 if (!first) { 00214 buffer << separator; 00215 } 00216 buffer << str; 00217 first = false; 00218 } 00219 return buffer.str(); 00220 } 00221 00225 static string Join(const vector<string>& strings) { 00226 std::ostringstream buffer; 00227 for (const auto& str : strings) { 00228 buffer << str; 00229 } 00230 return buffer.str(); 00231 } 00232 00233 static void GetByteMap(const char* str, const size_t utf8Length, 00234 vector<size_t>* byteMap) { 00235 if (byteMap->size() < utf8Length) { 00236 byteMap->resize(utf8Length); 00237 } 00238 const char* pstr = str; 00239 for (size_t i = 0; i < utf8Length; i++) { 00240 (*byteMap)[i] = pstr - str; 00241 pstr = NextChar(pstr); 00242 } 00243 } 00244 }; 00245 }