Open Chinese Convert  1.0.3
A project for conversion between Traditional and Simplified Chinese
PhraseExtract.hpp
00001 /*
00002  * Open Chinese Convert
00003  *
00004  * Copyright 2015 BYVoid <byvoid@byvoid.com>
00005  *
00006  * Licensed under the Apache License, Version 2.0 (the "License");
00007  * you may not use this file except in compliance with the License.
00008  * You may obtain a copy of the License at
00009  *
00010  *      http://www.apache.org/licenses/LICENSE-2.0
00011  *
00012  * Unless required by applicable law or agreed to in writing, software
00013  * distributed under the License is distributed on an "AS IS" BASIS,
00014  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00015  * See the License for the specific language governing permissions and
00016  * limitations under the License.
00017  */
00018 
00019 #pragma once
00020 
00021 #include <unordered_map>
00022 
00023 #include "Common.hpp"
00024 #include "UTF8StringSlice.hpp"
00025 
00026 namespace opencc {
00027 
00028 class PhraseExtract {
00029 public:
00030   typedef UTF8StringSlice::LengthType LengthType;
00031 
00032   typedef UTF8StringSliceBase<unsigned char> UTF8StringSlice8Bit;
00033 
00034   PhraseExtract();
00035 
00036   virtual ~PhraseExtract();
00037 
00038   void Extract(const string& text) {
00039     SetFullText(text);
00040     ExtractSuffixes();
00041     CalculateFrequency();
00042     CalculateSuffixEntropy();
00043     ReleaseSuffixes();
00044     ExtractPrefixes();
00045     CalculatePrefixEntropy();
00046     ReleasePrefixes();
00047     ExtractWordCandidates();
00048     CalculateCohesions();
00049     SelectWords();
00050   }
00051 
00052   void SetFullText(const string& fullText) {
00053     utf8FullText = UTF8StringSlice(fullText.c_str());
00054   }
00055 
00056   void SetFullText(const char* fullText) {
00057     utf8FullText = UTF8StringSlice(fullText);
00058   }
00059 
00060   void SetFullText(const UTF8StringSlice& fullText) { utf8FullText = fullText; }
00061 
00062   void SetWordMinLength(const LengthType _wordMinLength) {
00063     wordMinLength = _wordMinLength;
00064   }
00065 
00066   void SetWordMaxLength(const LengthType _wordMaxLength) {
00067     wordMaxLength = _wordMaxLength;
00068   }
00069 
00070   void SetPrefixSetLength(const LengthType _prefixSetLength) {
00071     prefixSetLength = _prefixSetLength;
00072   }
00073 
00074   void SetSuffixSetLength(const LengthType _suffixSetLength) {
00075     suffixSetLength = _suffixSetLength;
00076   }
00077 
00078   // PreCalculationFilter is called after frequencies statistics.
00079   void SetPreCalculationFilter(const std::function<
00080       bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>& filter) {
00081     preCalculationFilter = filter;
00082   }
00083 
00084   void SetPostCalculationFilter(const std::function<
00085       bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>& filter) {
00086     postCalculationFilter = filter;
00087   }
00088 
00089   void ReleaseSuffixes() { vector<UTF8StringSlice8Bit>().swap(suffixes); }
00090 
00091   void ReleasePrefixes() { vector<UTF8StringSlice8Bit>().swap(prefixes); }
00092 
00093   const vector<UTF8StringSlice8Bit>& Words() const { return words; }
00094 
00095   const vector<UTF8StringSlice8Bit>& WordCandidates() const {
00096     return wordCandidates;
00097   }
00098 
00099   struct Signals {
00100     size_t frequency;
00101     double cohesion;
00102     double suffixEntropy;
00103     double prefixEntropy;
00104   };
00105 
00106   const Signals& Signal(const UTF8StringSlice8Bit& wordCandidate) const;
00107 
00108   double Cohesion(const UTF8StringSlice8Bit& wordCandidate) const;
00109 
00110   double Entropy(const UTF8StringSlice8Bit& wordCandidate) const;
00111 
00112   double SuffixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
00113 
00114   double PrefixEntropy(const UTF8StringSlice8Bit& wordCandidate) const;
00115 
00116   size_t Frequency(const UTF8StringSlice8Bit& word) const;
00117 
00118   double Probability(const UTF8StringSlice8Bit& word) const;
00119 
00120   double LogProbability(const UTF8StringSlice8Bit& word) const;
00121 
00122   void Reset();
00123 
00124   void ExtractSuffixes();
00125 
00126   void ExtractPrefixes();
00127 
00128   void ExtractWordCandidates();
00129 
00130   void CalculateFrequency();
00131 
00132   void CalculateCohesions();
00133 
00134   void CalculateSuffixEntropy();
00135 
00136   void CalculatePrefixEntropy();
00137 
00138   void SelectWords();
00139 
00140   static bool
00141   DefaultPreCalculationFilter(const PhraseExtract&,
00142                               const PhraseExtract::UTF8StringSlice8Bit&);
00143 
00144   static bool
00145   DefaultPostCalculationFilter(const PhraseExtract&,
00146                                const PhraseExtract::UTF8StringSlice8Bit&);
00147 
00148 private:
00149   class DictType;
00150 
00151   // Pointwise Mutual Information
00152   double PMI(const UTF8StringSlice8Bit& wordCandidate,
00153              const UTF8StringSlice8Bit& part1,
00154              const UTF8StringSlice8Bit& part2) const;
00155 
00156   double CalculateCohesion(const UTF8StringSlice8Bit& wordCandidate) const;
00157 
00158   double CalculateEntropy(const std::unordered_map<
00159       UTF8StringSlice8Bit, size_t, UTF8StringSlice8Bit::Hasher>& choices) const;
00160 
00161   LengthType wordMinLength;
00162   LengthType wordMaxLength;
00163   LengthType prefixSetLength;
00164   LengthType suffixSetLength;
00165   std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
00166       preCalculationFilter;
00167   std::function<bool(const PhraseExtract&, const UTF8StringSlice8Bit&)>
00168       postCalculationFilter;
00169 
00170   bool prefixesExtracted;
00171   bool suffixesExtracted;
00172   bool frequenciesCalculated;
00173   bool wordCandidatesExtracted;
00174   bool cohesionsCalculated;
00175   bool prefixEntropiesCalculated;
00176   bool suffixEntropiesCalculated;
00177   bool wordsSelected;
00178 
00179   UTF8StringSlice utf8FullText;
00180   size_t totalOccurrence;
00181   double logTotalOccurrence;
00182   vector<UTF8StringSlice8Bit> prefixes;
00183   vector<UTF8StringSlice8Bit> suffixes;
00184   vector<UTF8StringSlice8Bit> wordCandidates;
00185   vector<UTF8StringSlice8Bit> words;
00186   DictType* signals;
00187 
00188   friend class PhraseExtractTest;
00189 };
00190 
00191 } // namespace opencc
 All Classes Functions