Botan  1.11.15
src/lib/utils/charset.cpp
Go to the documentation of this file.
00001 /*
00002 * Character Set Handling
00003 * (C) 1999-2007 Jack Lloyd
00004 *
00005 * Botan is released under the Simplified BSD License (see license.txt)
00006 */
00007 
00008 #include <botan/charset.h>
00009 #include <botan/parsing.h>
00010 #include <botan/exceptn.h>
00011 #include <cctype>
00012 
00013 namespace Botan {
00014 
00015 namespace Charset {
00016 
00017 namespace {
00018 
00019 /*
00020 * Convert from UCS-2 to ISO 8859-1
00021 */
00022 std::string ucs2_to_latin1(const std::string& ucs2)
00023    {
00024    if(ucs2.size() % 2 == 1)
00025       throw Decoding_Error("UCS-2 string has an odd number of bytes");
00026 
00027    std::string latin1;
00028 
00029    for(size_t i = 0; i != ucs2.size(); i += 2)
00030       {
00031       const byte c1 = ucs2[i];
00032       const byte c2 = ucs2[i+1];
00033 
00034       if(c1 != 0)
00035          throw Decoding_Error("UCS-2 has non-Latin1 characters");
00036 
00037       latin1 += static_cast<char>(c2);
00038       }
00039 
00040    return latin1;
00041    }
00042 
00043 /*
00044 * Convert from UTF-8 to ISO 8859-1
00045 */
00046 std::string utf8_to_latin1(const std::string& utf8)
00047    {
00048    std::string iso8859;
00049 
00050    size_t position = 0;
00051    while(position != utf8.size())
00052       {
00053       const byte c1 = static_cast<byte>(utf8[position++]);
00054 
00055       if(c1 <= 0x7F)
00056          iso8859 += static_cast<char>(c1);
00057       else if(c1 >= 0xC0 && c1 <= 0xC7)
00058          {
00059          if(position == utf8.size())
00060             throw Decoding_Error("UTF-8: sequence truncated");
00061 
00062          const byte c2 = static_cast<byte>(utf8[position++]);
00063          const byte iso_char = ((c1 & 0x07) << 6) | (c2 & 0x3F);
00064 
00065          if(iso_char <= 0x7F)
00066             throw Decoding_Error("UTF-8: sequence longer than needed");
00067 
00068          iso8859 += static_cast<char>(iso_char);
00069          }
00070       else
00071          throw Decoding_Error("UTF-8: Unicode chars not in Latin1 used");
00072       }
00073 
00074    return iso8859;
00075    }
00076 
00077 /*
00078 * Convert from ISO 8859-1 to UTF-8
00079 */
00080 std::string latin1_to_utf8(const std::string& iso8859)
00081    {
00082    std::string utf8;
00083    for(size_t i = 0; i != iso8859.size(); ++i)
00084       {
00085       const byte c = static_cast<byte>(iso8859[i]);
00086 
00087       if(c <= 0x7F)
00088          utf8 += static_cast<char>(c);
00089       else
00090          {
00091          utf8 += static_cast<char>((0xC0 | (c >> 6)));
00092          utf8 += static_cast<char>((0x80 | (c & 0x3F)));
00093          }
00094       }
00095    return utf8;
00096    }
00097 
00098 }
00099 
00100 /*
00101 * Perform character set transcoding
00102 */
00103 std::string transcode(const std::string& str,
00104                       Character_Set to, Character_Set from)
00105    {
00106    if(to == LOCAL_CHARSET)
00107       to = LATIN1_CHARSET;
00108    if(from == LOCAL_CHARSET)
00109       from = LATIN1_CHARSET;
00110 
00111    if(to == from)
00112       return str;
00113 
00114    if(from == LATIN1_CHARSET && to == UTF8_CHARSET)
00115       return latin1_to_utf8(str);
00116    if(from == UTF8_CHARSET && to == LATIN1_CHARSET)
00117       return utf8_to_latin1(str);
00118    if(from == UCS2_CHARSET && to == LATIN1_CHARSET)
00119       return ucs2_to_latin1(str);
00120 
00121    throw Invalid_Argument("Unknown transcoding operation from " +
00122                           std::to_string(from) + " to " + std::to_string(to));
00123    }
00124 
/*
* Return true if c is one of the decimal digit characters '0'..'9'.
* Does not consult the locale.
*/
bool is_digit(char c)
   {
   // C++ guarantees that '0'..'9' are contiguous and ascending, so a
   // range test matches exactly the ten digit characters.
   return (c >= '0' && c <= '9');
   }
00135 
/*
* Return true if c is a space, horizontal tab, line feed or carriage
* return. No other characters (including vertical tab and form feed)
* are treated as whitespace.
*/
bool is_space(char c)
   {
   switch(c)
      {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
         return true;
      default:
         return false;
      }
   }
00145 
00146 /*
00147 * Convert a character to a digit
00148 */
00149 byte char2digit(char c)
00150    {
00151    switch(c)
00152       {
00153       case '0': return 0;
00154       case '1': return 1;
00155       case '2': return 2;
00156       case '3': return 3;
00157       case '4': return 4;
00158       case '5': return 5;
00159       case '6': return 6;
00160       case '7': return 7;
00161       case '8': return 8;
00162       case '9': return 9;
00163       }
00164 
00165    throw Invalid_Argument("char2digit: Input is not a digit character");
00166    }
00167 
00168 /*
00169 * Convert a digit to a character
00170 */
00171 char digit2char(byte b)
00172    {
00173    switch(b)
00174       {
00175       case 0: return '0';
00176       case 1: return '1';
00177       case 2: return '2';
00178       case 3: return '3';
00179       case 4: return '4';
00180       case 5: return '5';
00181       case 6: return '6';
00182       case 7: return '7';
00183       case 8: return '8';
00184       case 9: return '9';
00185       }
00186 
00187    throw Invalid_Argument("digit2char: Input is not a digit");
00188    }
00189 
/*
* Compare two characters for equality, ignoring case.
* Both characters are lowered with std::tolower before being compared.
*/
bool caseless_cmp(char a, char b)
   {
   // std::tolower requires a value representable as unsigned char (or
   // EOF), so convert before calling it.
   const int folded_a = std::tolower(static_cast<unsigned char>(a));
   const int folded_b = std::tolower(static_cast<unsigned char>(b));

   return folded_a == folded_b;
   }
00198 
00199 }
00200 
00201 }