Open Chinese Convert
1.0.3
A project for conversion between Traditional and Simplified Chinese
|
00001 /* 00002 * Open Chinese Convert 00003 * 00004 * Copyright 2015 BYVoid <byvoid@byvoid.com> 00005 * 00006 * Licensed under the Apache License, Version 2.0 (the "License"); 00007 * you may not use this file except in compliance with the License. 00008 * You may obtain a copy of the License at 00009 * 00010 * http://www.apache.org/licenses/LICENSE-2.0 00011 * 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an "AS IS" BASIS, 00014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 */ 00018 00019 #include "Common.hpp" 00020 #include "UTF8Util.hpp" 00021 00022 namespace opencc { 00023 00024 namespace internal { 00025 00026 inline size_t FNVHash(const char* text, const size_t byteLength, 00027 const size_t FNV_prime, const size_t FNV_offset_basis) { 00028 size_t hash = FNV_offset_basis; 00029 for (const char* pstr = text; pstr < text + byteLength; pstr++) { 00030 hash ^= *pstr; 00031 hash *= FNV_prime; 00032 } 00033 return hash; 00034 } 00035 00036 template <int> size_t FNVHash(const char* text, const size_t byteLength); 00037 00038 template <> 00039 inline size_t FNVHash<4>(const char* text, const size_t byteLength) { 00040 return FNVHash(text, byteLength, 16777619UL, 2166136261UL); 00041 } 00042 00043 template <> 00044 inline size_t FNVHash<8>(const char* text, const size_t byteLength) { 00045 return FNVHash(text, byteLength, 1099511628211UL, 14695981039346656037UL); 00046 } 00047 00048 } // namespace internal 00049 00050 template <typename LENGTH_TYPE> class UTF8StringSliceBase { 00051 public: 00052 typedef LENGTH_TYPE LengthType; 00053 00054 UTF8StringSliceBase(const char* _str) 00055 : str(_str), utf8Length(UTF8Util::Length(_str)), 00056 byteLength(strlen(_str)) {} 00057 00058 UTF8StringSliceBase(const char* _str, const LengthType _utf8Length) 00059 : str(_str), utf8Length(_utf8Length) { 00060 CalculateByteLength(); 00061 } 00062 00063 UTF8StringSliceBase(const char* _str, const LengthType _utf8Length, 00064 const LengthType _byteLength) 00065 : str(_str), utf8Length(_utf8Length), byteLength(_byteLength) { 00066 CalculateByteLength(); 00067 } 00068 00069 LengthType UTF8Length() const { return utf8Length; } 00070 00071 LengthType ByteLength() const { return byteLength; } 00072 00073 UTF8StringSliceBase Left(const LengthType utf8Length) const { 00074 if (utf8Length == UTF8Length()) { 00075 return *this; 00076 } else { 00077 return UTF8StringSliceBase(str, utf8Length); 00078 } 00079 } 00080 00081 UTF8StringSliceBase Right(const LengthType utf8Length) const { 00082 if (utf8Length == UTF8Length()) { 00083 return *this; 00084 } else { 00085 const char* pstr = str + byteLength; 00086 for (size_t i = 0; i < utf8Length; i++) { 00087 pstr = UTF8Util::PrevChar(pstr); 00088 } 00089 return UTF8StringSliceBase(pstr, utf8Length); 00090 } 00091 } 00092 00093 UTF8StringSliceBase SubString(const LengthType offset, 00094 const LengthType utf8Length) const { 00095 if (offset == 0) { 00096 return Left(utf8Length); 00097 } else { 00098 const char* pstr = str; 00099 for (size_t i = 0; i < offset; i++) { 00100 pstr = UTF8Util::NextChar(pstr); 00101 } 00102 return UTF8StringSliceBase(pstr, utf8Length); 00103 } 00104 } 00105 00106 string ToString() const { return string(str, str + byteLength); } 00107 00108 const char* CString() const { return str; } 00109 00110 LengthType CommonPrefixLength(const UTF8StringSliceBase& that) const { 00111 if (str == that.str) { 00112 return std::min(utf8Length, that.utf8Length); 00113 } else { 00114 const char* pstr1 = str; 00115 const char* pstr2 = that.str; 00116 for (size_t length = 0; length < utf8Length && length < that.utf8Length; 00117 length++) { 00118 size_t charLen1 = UTF8Util::NextCharLength(pstr1); 00119 size_t charLen2 = UTF8Util::NextCharLength(pstr2); 00120 if (charLen1 != charLen2 || strncmp(pstr1, pstr2, charLen1) != 0) { 00121 return length; 00122 } 00123 pstr1 += charLen1; 00124 pstr2 += charLen2; 00125 } 00126 return 0; 00127 } 00128 } 00129 00130 void MoveRight() { 00131 if (utf8Length > 0) { 00132 const size_t charLen = UTF8Util::NextCharLength(str); 00133 str += charLen; 00134 utf8Length--; 00135 byteLength -= charLen; 00136 } 00137 } 00138 00139 void MoveLeft() { 00140 if (utf8Length > 0) { 00141 const size_t charLen = UTF8Util::PrevCharLength(str + byteLength); 00142 utf8Length--; 00143 byteLength -= charLen; 00144 } 00145 } 00146 00147 int ReverseCompare(const UTF8StringSliceBase& that) const { 00148 const char* pstr1 = str + byteLength; 00149 const char* pstr2 = that.str + that.byteLength; 00150 const size_t length = std::min(utf8Length, that.utf8Length); 00151 for (size_t i = 0; i < length; i++) { 00152 const size_t charLen1 = UTF8Util::PrevCharLength(pstr1); 00153 const size_t charLen2 = UTF8Util::PrevCharLength(pstr2); 00154 pstr1 -= charLen1; 00155 pstr2 -= charLen2; 00156 const int cmp = strncmp(pstr1, pstr2, std::min(charLen1, charLen2)); 00157 if (cmp < 0) { 00158 return -1; 00159 } else if (cmp > 0) { 00160 return 1; 00161 } else if (charLen1 < charLen2) { 00162 return -1; 00163 } else if (charLen1 > charLen2) { 00164 return 1; 00165 } 00166 } 00167 if (utf8Length < that.utf8Length) { 00168 return -1; 00169 } else if (utf8Length > that.utf8Length) { 00170 return 1; 00171 } else { 00172 return 0; 00173 } 00174 } 00175 00176 LengthType FindBytePosition(const UTF8StringSliceBase& pattern) const { 00177 return static_cast<LengthType>( 00178 ToString().find(pattern.str, 0, pattern.byteLength)); 00179 } 00180 00181 bool operator<(const UTF8StringSliceBase& that) const { 00182 return Compare(that) < 0; 00183 } 00184 00185 bool operator>(const UTF8StringSliceBase& that) const { 00186 return Compare(that) > 0; 00187 } 00188 00189 bool operator==(const UTF8StringSliceBase& that) const { 00190 return (str == that.str && utf8Length == that.utf8Length) || 00191 Compare(that) == 0; 00192 } 00193 00194 bool operator!=(const UTF8StringSliceBase& that) const { 00195 return !this->operator==(that); 00196 } 00197 00198 class Hasher { 00199 public: 00200 size_t operator()(const UTF8StringSliceBase& text) const { 00201 return internal::FNVHash<sizeof(size_t)>(text.CString(), 00202 text.ByteLength()); 00203 } 00204 }; 00205 00206 private: 00207 inline int Compare(const UTF8StringSliceBase& that) const { 00208 int cmp = strncmp(str, that.str, std::min(byteLength, that.byteLength)); 00209 if (cmp == 0) { 00210 if (utf8Length < that.utf8Length) { 00211 cmp = -1; 00212 } else if (utf8Length > that.utf8Length) { 00213 cmp = 1; 00214 } else { 00215 cmp = 0; 00216 } 00217 } 00218 return cmp; 00219 } 00220 00221 void CalculateByteLength() { 00222 const char* pstr = str; 00223 for (size_t i = 0; i < utf8Length; i++) { 00224 pstr = UTF8Util::NextChar(pstr); 00225 } 00226 byteLength = pstr - str; 00227 } 00228 00229 const char* str; 00230 LengthType utf8Length; 00231 LengthType byteLength; 00232 }; 00233 00234 typedef UTF8StringSliceBase<size_t> UTF8StringSlice; 00235 00236 template <typename LENGTH_TYPE> 00237 std::ostream& operator<<(::std::ostream& os, 00238 const UTF8StringSliceBase<LENGTH_TYPE>& str) { 00239 return os << str.ToString(); 00240 } 00241 00242 } // namespace opencc