Open Chinese Convert  1.0.3
A project for conversion between Traditional and Simplified Chinese
UTF8StringSlice.hpp
00001 /*
00002  * Open Chinese Convert
00003  *
00004  * Copyright 2015 BYVoid <byvoid@byvoid.com>
00005  *
00006  * Licensed under the Apache License, Version 2.0 (the "License");
00007  * you may not use this file except in compliance with the License.
00008  * You may obtain a copy of the License at
00009  *
00010  *      http://www.apache.org/licenses/LICENSE-2.0
00011  *
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  */
00018 
00019 #include "Common.hpp"
00020 #include "UTF8Util.hpp"
00021 
00022 namespace opencc {
00023 
namespace internal {

// Generic FNV hash over the first `byteLength` bytes of `text`.
//
// For each byte the hash is XOR-ed with the byte and then multiplied by
// `FNV_prime` (the FNV-1a ordering), starting from `FNV_offset_basis`.
// Arithmetic is done in size_t, so it wraps modulo 2^(bits of size_t).
//
// Every byte is folded in as an unsigned value in [0, 255]. Reading it through
// a plain `char` would sign-extend bytes >= 0x80 on platforms where `char` is
// signed, which makes the result platform-dependent for non-ASCII input.
inline size_t FNVHash(const char* text, const size_t byteLength,
                      const size_t FNV_prime, const size_t FNV_offset_basis) {
  size_t hash = FNV_offset_basis;
  for (size_t i = 0; i < byteLength; i++) {
    hash ^= static_cast<size_t>(static_cast<unsigned char>(text[i]));
    hash *= FNV_prime;
  }
  return hash;
}

// Selects the FNV parameters by a byte width (callers in this file pass
// sizeof(size_t)). Only the widths 4 and 8 are specialised below; any other
// width has no definition.
template <int> size_t FNVHash(const char* text, const size_t byteLength);

// Width 4: uses the 32-bit FNV prime and offset basis.
template <>
inline size_t FNVHash<4>(const char* text, const size_t byteLength) {
  return FNVHash(text, byteLength, 16777619UL, 2166136261UL);
}

// Width 8: uses the 64-bit FNV prime and offset basis.
template <>
inline size_t FNVHash<8>(const char* text, const size_t byteLength) {
  return FNVHash(text, byteLength, 1099511628211UL, 14695981039346656037UL);
}

} // namespace internal
00049 
00050 template <typename LENGTH_TYPE> class UTF8StringSliceBase {
00051 public:
00052   typedef LENGTH_TYPE LengthType;
00053 
00054   UTF8StringSliceBase(const char* _str)
00055       : str(_str), utf8Length(UTF8Util::Length(_str)),
00056         byteLength(strlen(_str)) {}
00057 
00058   UTF8StringSliceBase(const char* _str, const LengthType _utf8Length)
00059       : str(_str), utf8Length(_utf8Length) {
00060     CalculateByteLength();
00061   }
00062 
00063   UTF8StringSliceBase(const char* _str, const LengthType _utf8Length,
00064                       const LengthType _byteLength)
00065       : str(_str), utf8Length(_utf8Length), byteLength(_byteLength) {
00066     CalculateByteLength();
00067   }
00068 
00069   LengthType UTF8Length() const { return utf8Length; }
00070 
00071   LengthType ByteLength() const { return byteLength; }
00072 
00073   UTF8StringSliceBase Left(const LengthType utf8Length) const {
00074     if (utf8Length == UTF8Length()) {
00075       return *this;
00076     } else {
00077       return UTF8StringSliceBase(str, utf8Length);
00078     }
00079   }
00080 
00081   UTF8StringSliceBase Right(const LengthType utf8Length) const {
00082     if (utf8Length == UTF8Length()) {
00083       return *this;
00084     } else {
00085       const char* pstr = str + byteLength;
00086       for (size_t i = 0; i < utf8Length; i++) {
00087         pstr = UTF8Util::PrevChar(pstr);
00088       }
00089       return UTF8StringSliceBase(pstr, utf8Length);
00090     }
00091   }
00092 
00093   UTF8StringSliceBase SubString(const LengthType offset,
00094                                 const LengthType utf8Length) const {
00095     if (offset == 0) {
00096       return Left(utf8Length);
00097     } else {
00098       const char* pstr = str;
00099       for (size_t i = 0; i < offset; i++) {
00100         pstr = UTF8Util::NextChar(pstr);
00101       }
00102       return UTF8StringSliceBase(pstr, utf8Length);
00103     }
00104   }
00105 
00106   string ToString() const { return string(str, str + byteLength); }
00107 
00108   const char* CString() const { return str; }
00109 
00110   LengthType CommonPrefixLength(const UTF8StringSliceBase& that) const {
00111     if (str == that.str) {
00112       return std::min(utf8Length, that.utf8Length);
00113     } else {
00114       const char* pstr1 = str;
00115       const char* pstr2 = that.str;
00116       for (size_t length = 0; length < utf8Length && length < that.utf8Length;
00117            length++) {
00118         size_t charLen1 = UTF8Util::NextCharLength(pstr1);
00119         size_t charLen2 = UTF8Util::NextCharLength(pstr2);
00120         if (charLen1 != charLen2 || strncmp(pstr1, pstr2, charLen1) != 0) {
00121           return length;
00122         }
00123         pstr1 += charLen1;
00124         pstr2 += charLen2;
00125       }
00126       return 0;
00127     }
00128   }
00129 
00130   void MoveRight() {
00131     if (utf8Length > 0) {
00132       const size_t charLen = UTF8Util::NextCharLength(str);
00133       str += charLen;
00134       utf8Length--;
00135       byteLength -= charLen;
00136     }
00137   }
00138 
00139   void MoveLeft() {
00140     if (utf8Length > 0) {
00141       const size_t charLen = UTF8Util::PrevCharLength(str + byteLength);
00142       utf8Length--;
00143       byteLength -= charLen;
00144     }
00145   }
00146 
00147   int ReverseCompare(const UTF8StringSliceBase& that) const {
00148     const char* pstr1 = str + byteLength;
00149     const char* pstr2 = that.str + that.byteLength;
00150     const size_t length = std::min(utf8Length, that.utf8Length);
00151     for (size_t i = 0; i < length; i++) {
00152       const size_t charLen1 = UTF8Util::PrevCharLength(pstr1);
00153       const size_t charLen2 = UTF8Util::PrevCharLength(pstr2);
00154       pstr1 -= charLen1;
00155       pstr2 -= charLen2;
00156       const int cmp = strncmp(pstr1, pstr2, std::min(charLen1, charLen2));
00157       if (cmp < 0) {
00158         return -1;
00159       } else if (cmp > 0) {
00160         return 1;
00161       } else if (charLen1 < charLen2) {
00162         return -1;
00163       } else if (charLen1 > charLen2) {
00164         return 1;
00165       }
00166     }
00167     if (utf8Length < that.utf8Length) {
00168       return -1;
00169     } else if (utf8Length > that.utf8Length) {
00170       return 1;
00171     } else {
00172       return 0;
00173     }
00174   }
00175 
00176   LengthType FindBytePosition(const UTF8StringSliceBase& pattern) const {
00177     return static_cast<LengthType>(
00178         ToString().find(pattern.str, 0, pattern.byteLength));
00179   }
00180 
00181   bool operator<(const UTF8StringSliceBase& that) const {
00182     return Compare(that) < 0;
00183   }
00184 
00185   bool operator>(const UTF8StringSliceBase& that) const {
00186     return Compare(that) > 0;
00187   }
00188 
00189   bool operator==(const UTF8StringSliceBase& that) const {
00190     return (str == that.str && utf8Length == that.utf8Length) ||
00191            Compare(that) == 0;
00192   }
00193 
00194   bool operator!=(const UTF8StringSliceBase& that) const {
00195     return !this->operator==(that);
00196   }
00197 
00198   class Hasher {
00199   public:
00200     size_t operator()(const UTF8StringSliceBase& text) const {
00201       return internal::FNVHash<sizeof(size_t)>(text.CString(),
00202                                                text.ByteLength());
00203     }
00204   };
00205 
00206 private:
00207   inline int Compare(const UTF8StringSliceBase& that) const {
00208     int cmp = strncmp(str, that.str, std::min(byteLength, that.byteLength));
00209     if (cmp == 0) {
00210       if (utf8Length < that.utf8Length) {
00211         cmp = -1;
00212       } else if (utf8Length > that.utf8Length) {
00213         cmp = 1;
00214       } else {
00215         cmp = 0;
00216       }
00217     }
00218     return cmp;
00219   }
00220 
00221   void CalculateByteLength() {
00222     const char* pstr = str;
00223     for (size_t i = 0; i < utf8Length; i++) {
00224       pstr = UTF8Util::NextChar(pstr);
00225     }
00226     byteLength = pstr - str;
00227   }
00228 
00229   const char* str;
00230   LengthType utf8Length;
00231   LengthType byteLength;
00232 };
00233 
00234 typedef UTF8StringSliceBase<size_t> UTF8StringSlice;
00235 
00236 template <typename LENGTH_TYPE>
00237 std::ostream& operator<<(::std::ostream& os,
00238                          const UTF8StringSliceBase<LENGTH_TYPE>& str) {
00239   return os << str.ToString();
00240 }
00241 
00242 } // namespace opencc
 All Classes Functions