Botan
1.11.15
|
00001 /* 00002 * Character Set Handling 00003 * (C) 1999-2007 Jack Lloyd 00004 * 00005 * Botan is released under the Simplified BSD License (see license.txt) 00006 */ 00007 00008 #include <botan/charset.h> 00009 #include <botan/parsing.h> 00010 #include <botan/exceptn.h> 00011 #include <cctype> 00012 00013 namespace Botan { 00014 00015 namespace Charset { 00016 00017 namespace { 00018 00019 /* 00020 * Convert from UCS-2 to ISO 8859-1 00021 */ 00022 std::string ucs2_to_latin1(const std::string& ucs2) 00023 { 00024 if(ucs2.size() % 2 == 1) 00025 throw Decoding_Error("UCS-2 string has an odd number of bytes"); 00026 00027 std::string latin1; 00028 00029 for(size_t i = 0; i != ucs2.size(); i += 2) 00030 { 00031 const byte c1 = ucs2[i]; 00032 const byte c2 = ucs2[i+1]; 00033 00034 if(c1 != 0) 00035 throw Decoding_Error("UCS-2 has non-Latin1 characters"); 00036 00037 latin1 += static_cast<char>(c2); 00038 } 00039 00040 return latin1; 00041 } 00042 00043 /* 00044 * Convert from UTF-8 to ISO 8859-1 00045 */ 00046 std::string utf8_to_latin1(const std::string& utf8) 00047 { 00048 std::string iso8859; 00049 00050 size_t position = 0; 00051 while(position != utf8.size()) 00052 { 00053 const byte c1 = static_cast<byte>(utf8[position++]); 00054 00055 if(c1 <= 0x7F) 00056 iso8859 += static_cast<char>(c1); 00057 else if(c1 >= 0xC0 && c1 <= 0xC7) 00058 { 00059 if(position == utf8.size()) 00060 throw Decoding_Error("UTF-8: sequence truncated"); 00061 00062 const byte c2 = static_cast<byte>(utf8[position++]); 00063 const byte iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F); 00064 00065 if(iso_char <= 0x7F) 00066 throw Decoding_Error("UTF-8: sequence longer than needed"); 00067 00068 iso8859 += static_cast<char>(iso_char); 00069 } 00070 else 00071 throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used"); 00072 } 00073 00074 return iso8859; 00075 } 00076 00077 /* 00078 * Convert from ISO 8859-1 to UTF-8 00079 */ 00080 std::string latin1_to_utf8(const std::string& iso8859) 00081 { 00082 std::string utf8; 00083 for(size_t i = 0; i != iso8859.size(); ++i) 00084 { 00085 const byte c = static_cast<byte>(iso8859[i]); 00086 00087 if(c <= 0x7F) 00088 utf8 += static_cast<char>(c); 00089 else 00090 { 00091 utf8 += static_cast<char>((0xC0 | (c >> 6))); 00092 utf8 += static_cast<char>((0x80 | (c & 0x3F))); 00093 } 00094 } 00095 return utf8; 00096 } 00097 00098 } 00099 00100 /* 00101 * Perform character set transcoding 00102 */ 00103 std::string transcode(const std::string& str, 00104 Character_Set to, Character_Set from) 00105 { 00106 if(to == LOCAL_CHARSET) 00107 to = LATIN1_CHARSET; 00108 if(from == LOCAL_CHARSET) 00109 from = LATIN1_CHARSET; 00110 00111 if(to == from) 00112 return str; 00113 00114 if(from == LATIN1_CHARSET && to == UTF8_CHARSET) 00115 return latin1_to_utf8(str); 00116 if(from == UTF8_CHARSET && to == LATIN1_CHARSET) 00117 return utf8_to_latin1(str); 00118 if(from == UCS2_CHARSET && to == LATIN1_CHARSET) 00119 return ucs2_to_latin1(str); 00120 00121 throw Invalid_Argument("Unknown transcoding operation from " + 00122 std::to_string(from) + " to " + std::to_string(to)); 00123 } 00124 00125 /* 00126 * Check if a character represents a digit 00127 */ 00128 bool is_digit(char c) 00129 { 00130 if(c == '0' || c == '1' || c == '2' || c == '3' || c == '4' || 00131 c == '5' || c == '6' || c == '7' || c == '8' || c == '9') 00132 return true; 00133 return false; 00134 } 00135 00136 /* 00137 * Check if a character represents whitespace 00138 */ 00139 bool is_space(char c) 00140 { 00141 if(c == ' ' || c == '\t' || c == '\n' || c == '\r') 00142 return true; 00143 return false; 00144 } 00145 00146 /* 00147 * Convert a character to a digit 00148 */ 00149 byte char2digit(char c) 00150 { 00151 switch(c) 00152 { 00153 case '0': return 0; 00154 case '1': return 1; 00155 case '2': return 2; 00156 case '3': return 3; 00157 case '4': return 4; 00158 case '5': return 5; 00159 case '6': return 6; 00160 case '7': return 7; 00161 case '8': return 8; 00162 case '9': return 9; 00163 } 00164 00165 throw Invalid_Argument("char2digit: Input is not a digit character"); 00166 } 00167 00168 /* 00169 * Convert a digit to a character 00170 */ 00171 char digit2char(byte b) 00172 { 00173 switch(b) 00174 { 00175 case 0: return '0'; 00176 case 1: return '1'; 00177 case 2: return '2'; 00178 case 3: return '3'; 00179 case 4: return '4'; 00180 case 5: return '5'; 00181 case 6: return '6'; 00182 case 7: return '7'; 00183 case 8: return '8'; 00184 case 9: return '9'; 00185 } 00186 00187 throw Invalid_Argument("digit2char: Input is not a digit"); 00188 } 00189 00190 /* 00191 * Case-insensitive character comparison 00192 */ 00193 bool caseless_cmp(char a, char b) 00194 { 00195 return (std::tolower(static_cast<unsigned char>(a)) == 00196 std::tolower(static_cast<unsigned char>(b))); 00197 } 00198 00199 } 00200 00201 }