// Open Chinese Convert
// 1.0.3
// A project for conversion between Traditional and Simplified Chinese
//
00001 /* 00002 * Open Chinese Convert 00003 * 00004 * Copyright 2015 BYVoid <byvoid@byvoid.com> 00005 * 00006 * Licensed under the Apache License, Version 2.0 (the "License"); 00007 * you may not use this file except in compliance with the License. 00008 * You may obtain a copy of the License at 00009 * 00010 * http://www.apache.org/licenses/LICENSE-2.0 00011 * 00012 * Unless required by applicable law or agreed to in writing, software 00013 * distributed under the License is distributed on an "AS IS" BASIS, 00014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 00015 * See the License for the specific language governing permissions and 00016 * limitations under the License. 00017 */ 00018 00019 #pragma once 00020 00021 #include <unordered_map> 00022 00023 #include "Common.hpp" 00024 #include "UTF8StringSlice.hpp" 00025 00026 namespace opencc { 00027 00028 class PhraseExtract { 00029 public: 00030 typedef UTF8StringSlice::LengthType LengthType; 00031 00032 typedef UTF8StringSliceBase<unsigned char> UTF8StringSlice8Bit; 00033 00034 PhraseExtract(); 00035 00036 virtual ~PhraseExtract(); 00037 00038 void Extract(const string& text) { 00039 SetFullText(text); 00040 ExtractSuffixes(); 00041 CalculateFrequency(); 00042 CalculateSuffixEntropy(); 00043 ReleaseSuffixes(); 00044 ExtractPrefixes(); 00045 CalculatePrefixEntropy(); 00046 ReleasePrefixes(); 00047 ExtractWordCandidates(); 00048 CalculateCohesions(); 00049 SelectWords(); 00050 } 00051 00052 void SetFullText(const string& fullText) { 00053 utf8FullText = UTF8StringSlice(fullText.c_str()); 00054 } 00055 00056 void SetFullText(const char* fullText) { 00057 utf8FullText = UTF8StringSlice(fullText); 00058 } 00059 00060 void SetFullText(const UTF8StringSlice& fullText) { utf8FullText = fullText; } 00061 00062 void SetWordMinLength(const LengthType _wordMinLength) { 00063 wordMinLength = _wordMinLength; 00064 } 00065 00066 void SetWordMaxLength(const LengthType _wordMaxLength) { 00067 
wordMaxLength = _wordMaxLength; 00068 } 00069 00070 void SetPrefixSetLength(const LengthType _prefixSetLength) { 00071 prefixSetLength = _prefixSetLength; 00072 } 00073 00074 void SetSuffixSetLength(const LengthType _suffixSetLength) { 00075 suffixSetLength = _suffixSetLength; 00076 } 00077 00078 // PreCalculationFilter is called after frequencies statistics. 00079 void SetPreCalculationFilter(const std::function< 00080 bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>& filter) { 00081 preCalculationFilter = filter; 00082 } 00083 00084 void SetPostCalculationFilter(const std::function< 00085 bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>& filter) { 00086 postCalculationFilter = filter; 00087 } 00088 00089 void ReleaseSuffixes() { vector<UTF8StringSlice8Bit>().swap(suffixes); } 00090 00091 void ReleasePrefixes() { vector<UTF8StringSlice8Bit>().swap(prefixes); } 00092 00093 const vector<UTF8StringSlice8Bit>& Words() const { return words; } 00094 00095 const vector<UTF8StringSlice8Bit>& WordCandidates() const { 00096 return wordCandidates; 00097 } 00098 00099 struct Signals { 00100 size_t frequency; 00101 double cohesion; 00102 double suffixEntropy; 00103 double prefixEntropy; 00104 }; 00105 00106 const Signals& Signal(const UTF8StringSlice8Bit& wordCandidate) const; 00107 00108 double Cohesion(const UTF8StringSlice8Bit& wordCandidate) const; 00109 00110 double Entropy(const UTF8StringSlice8Bit& wordCandidate) const; 00111 00112 double SuffixEntropy(const UTF8StringSlice8Bit& wordCandidate) const; 00113 00114 double PrefixEntropy(const UTF8StringSlice8Bit& wordCandidate) const; 00115 00116 size_t Frequency(const UTF8StringSlice8Bit& word) const; 00117 00118 double Probability(const UTF8StringSlice8Bit& word) const; 00119 00120 double LogProbability(const UTF8StringSlice8Bit& word) const; 00121 00122 void Reset(); 00123 00124 void ExtractSuffixes(); 00125 00126 void ExtractPrefixes(); 00127 00128 void ExtractWordCandidates(); 00129 00130 void 
CalculateFrequency(); 00131 00132 void CalculateCohesions(); 00133 00134 void CalculateSuffixEntropy(); 00135 00136 void CalculatePrefixEntropy(); 00137 00138 void SelectWords(); 00139 00140 static bool 00141 DefaultPreCalculationFilter(const PhraseExtract&, 00142 const PhraseExtract::UTF8StringSlice8Bit&); 00143 00144 static bool 00145 DefaultPostCalculationFilter(const PhraseExtract&, 00146 const PhraseExtract::UTF8StringSlice8Bit&); 00147 00148 private: 00149 class DictType; 00150 00151 // Pointwise Mutual Information 00152 double PMI(const UTF8StringSlice8Bit& wordCandidate, 00153 const UTF8StringSlice8Bit& part1, 00154 const UTF8StringSlice8Bit& part2) const; 00155 00156 double CalculateCohesion(const UTF8StringSlice8Bit& wordCandidate) const; 00157 00158 double CalculateEntropy(const std::unordered_map< 00159 UTF8StringSlice8Bit, size_t, UTF8StringSlice8Bit::Hasher>& choices) const; 00160 00161 LengthType wordMinLength; 00162 LengthType wordMaxLength; 00163 LengthType prefixSetLength; 00164 LengthType suffixSetLength; 00165 std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)> 00166 preCalculationFilter; 00167 std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)> 00168 postCalculationFilter; 00169 00170 bool prefixesExtracted; 00171 bool suffixesExtracted; 00172 bool frequenciesCalculated; 00173 bool wordCandidatesExtracted; 00174 bool cohesionsCalculated; 00175 bool prefixEntropiesCalculated; 00176 bool suffixEntropiesCalculated; 00177 bool wordsSelected; 00178 00179 UTF8StringSlice utf8FullText; 00180 size_t totalOccurrence; 00181 double logTotalOccurrence; 00182 vector<UTF8StringSlice8Bit> prefixes; 00183 vector<UTF8StringSlice8Bit> suffixes; 00184 vector<UTF8StringSlice8Bit> wordCandidates; 00185 vector<UTF8StringSlice8Bit> words; 00186 DictType* signals; 00187 00188 friend class PhraseExtractTest; 00189 }; 00190 00191 } // namespace opencc