pion  5.0.6
src/algorithm.cpp
00001 // ---------------------------------------------------------------------
00002 // pion:  a Boost C++ framework for building lightweight HTTP interfaces
00003 // ---------------------------------------------------------------------
00004 // Copyright (C) 2007-2014 Splunk Inc.  (https://github.com/splunk/pion)
00005 //
00006 // Distributed under the Boost Software License, Version 1.0.
00007 // See http://www.boost.org/LICENSE_1_0.txt
00008 //
00009 
00010 #include <cmath>
00011 #include <cstdlib>
00012 #include <cstdio>
00013 #include <cstring>
00014 #include <pion/algorithm.hpp>
00015 #include <boost/assert.hpp>
00016 
00017 // macro to shift bitmask by a single bit
00018 #define SHIFT_BITMASK(ptr, mask)    if (mask & 0x01) { mask = 0x80; ++ptr; } else mask >>= 1;
00019 
00020 
00021 namespace pion {        // begin namespace pion
00022 
00023 
00024 bool algorithm::base64_decode(const std::string &input, std::string &output)
00025 {
00026     static const char nop = -1; 
00027     static const char decoding_data[] = {
00028         nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
00029         nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
00030         nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop, 62, nop,nop,nop, 63,
00031         52, 53, 54,  55,  56, 57, 58, 59,  60, 61,nop,nop, nop,nop,nop,nop,
00032         nop, 0,  1,   2,   3,  4,  5,  6,   7,  8,  9, 10,  11, 12, 13, 14,
00033         15, 16, 17,  18,  19, 20, 21, 22,  23, 24, 25,nop, nop,nop,nop,nop,
00034         nop,26, 27,  28,  29, 30, 31, 32,  33, 34, 35, 36,  37, 38, 39, 40,
00035         41, 42, 43,  44,  45, 46, 47, 48,  49, 50, 51,nop, nop,nop,nop,nop,
00036         nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
00037         nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
00038         nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
00039         nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
00040         nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
00041         nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
00042         nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop,
00043         nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop, nop,nop,nop,nop
00044         };
00045 
00046     unsigned int input_length=input.size();
00047     const char * input_ptr = input.data();
00048 
00049     // allocate space for output string
00050     output.clear();
00051     output.reserve(((input_length+2)/3)*4);
00052 
00053     // for each 4-bytes sequence from the input, extract 4 6-bits sequences by droping first two bits
00054     // and regenerate into 3 8-bits sequence
00055 
00056     for (unsigned int i=0; i<input_length;i++) {
00057         char base64code0;
00058         char base64code1;
00059         char base64code2 = 0;   // initialized to 0 to suppress warnings
00060         char base64code3;
00061 
00062         base64code0 = decoding_data[static_cast<int>(input_ptr[i])];
00063         if(base64code0==nop)            // non base64 character
00064             return false;
00065         if(!(++i<input_length)) // we need at least two input bytes for first byte output
00066             return false;
00067         base64code1 = decoding_data[static_cast<int>(input_ptr[i])];
00068         if(base64code1==nop)            // non base64 character
00069             return false;
00070 
00071         output += ((base64code0 << 2) | ((base64code1 >> 4) & 0x3));
00072 
00073         if(++i<input_length) {
00074             char c = input_ptr[i];
00075             if(c =='=') { // padding , end of input
00076                 BOOST_ASSERT( (base64code1 & 0x0f)==0);
00077                 return true;
00078             }
00079             base64code2 = decoding_data[static_cast<int>(input_ptr[i])];
00080             if(base64code2==nop)            // non base64 character
00081                 return false;
00082 
00083             output += ((base64code1 << 4) & 0xf0) | ((base64code2 >> 2) & 0x0f);
00084         }
00085 
00086         if(++i<input_length) {
00087             char c = input_ptr[i];
00088             if(c =='=') { // padding , end of input
00089                 BOOST_ASSERT( (base64code2 & 0x03)==0);
00090                 return true;
00091             }
00092             base64code3 = decoding_data[static_cast<int>(input_ptr[i])];
00093             if(base64code3==nop)            // non base64 character
00094                 return false;
00095 
00096             output += (((base64code2 << 6) & 0xc0) | base64code3 );
00097         }
00098 
00099     }
00100 
00101     return true;
00102 }
00103 
00104 bool algorithm::base64_encode(const std::string &input, std::string &output)
00105 {
00106     static const char encoding_data[] = 
00107         "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
00108 
00109     unsigned int input_length=input.size();
00110     const char * input_ptr = input.data();
00111 
00112     // allocate space for output string
00113     output.clear();
00114     output.reserve(((input_length+2)/3)*4);
00115 
00116     // for each 3-bytes sequence from the input, extract 4 6-bits sequences and encode using 
00117     // encoding_data lookup table.
00118     // if input do not contains enough chars to complete 3-byte sequence,use pad char '=' 
00119     for (unsigned int i=0; i<input_length;i++) {
00120         int base64code0=0;
00121         int base64code1=0;
00122         int base64code2=0;
00123         int base64code3=0;
00124 
00125         base64code0 = (input_ptr[i] >> 2)  & 0x3f;  // 1-byte 6 bits
00126         output += encoding_data[base64code0];
00127         base64code1 = (input_ptr[i] << 4 ) & 0x3f;  // 1-byte 2 bits +
00128 
00129         if (++i < input_length) {
00130             base64code1 |= (input_ptr[i] >> 4) & 0x0f; // 2-byte 4 bits
00131             output += encoding_data[base64code1];
00132             base64code2 = (input_ptr[i] << 2) & 0x3f;  // 2-byte 4 bits + 
00133 
00134             if (++i < input_length) {
00135                 base64code2 |= (input_ptr[i] >> 6) & 0x03; // 3-byte 2 bits
00136                 base64code3  = input_ptr[i] & 0x3f;       // 3-byte 6 bits
00137                 output += encoding_data[base64code2];
00138                 output += encoding_data[base64code3];
00139             } else {
00140                 output += encoding_data[base64code2];
00141                 output += '=';
00142             }
00143         } else {
00144             output += encoding_data[base64code1];
00145             output += '=';
00146             output += '=';
00147         }
00148     }
00149 
00150     return true;
00151 }
00152 
00153 std::string algorithm::url_decode(const std::string& str)
00154 {
00155     char decode_buf[3];
00156     std::string result;
00157     result.reserve(str.size());
00158     
00159     for (std::string::size_type pos = 0; pos < str.size(); ++pos) {
00160         switch(str[pos]) {
00161         case '+':
00162             // convert to space character
00163             result += ' ';
00164             break;
00165         case '%':
00166             // decode hexadecimal value
00167             if (pos + 2 < str.size()) {
00168                 decode_buf[0] = str[++pos];
00169                 decode_buf[1] = str[++pos];
00170                 decode_buf[2] = '\0';
00171 
00172                 char decoded_char = static_cast<char>( strtol(decode_buf, 0, 16) );
00173 
00174                 // decoded_char will be '\0' if strtol can't parse decode_buf as hex
00175                 // (or if decode_buf == "00", which is also not valid).
00176                 // In this case, recover from error by not decoding.
00177                 if (decoded_char == '\0') {
00178                     result += '%';
00179                     pos -= 2;
00180                 } else
00181                     result += decoded_char;
00182             } else {
00183                 // recover from error by not decoding character
00184                 result += '%';
00185             }
00186             break;
00187         default:
00188             // character does not need to be escaped
00189             result += str[pos];
00190         }
00191     };
00192     
00193     return result;
00194 }
00195     
00196 std::string algorithm::url_encode(const std::string& str)
00197 {
00198     char encode_buf[4];
00199     std::string result;
00200     encode_buf[0] = '%';
00201     result.reserve(str.size());
00202 
00203     // character selection for this algorithm is based on the following url:
00204     // http://www.blooberry.com/indexdot/html/topics/urlencoding.htm
00205     
00206     for (std::string::size_type pos = 0; pos < str.size(); ++pos) {
00207         switch(str[pos]) {
00208         default:
00209             if (str[pos] > 32 && str[pos] < 127) {
00210                 // character does not need to be escaped
00211                 result += str[pos];
00212                 break;
00213             }
00214             // else pass through to next case
00215         case ' ':   
00216         case '$': case '&': case '+': case ',': case '/': case ':':
00217         case ';': case '=': case '?': case '@': case '"': case '<':
00218         case '>': case '#': case '%': case '{': case '}': case '|':
00219         case '\\': case '^': case '~': case '[': case ']': case '`':
00220             // the character needs to be encoded
00221             sprintf(encode_buf+1, "%.2X", (unsigned char)(str[pos]));
00222             result += encode_buf;
00223             break;
00224         }
00225     };
00226     
00227     return result;
00228 }
00229 
00230 // TODO
00231 //std::string algorithm::xml_decode(const std::string& str)
00232 //{
00233 //}
00234 
00235 std::string algorithm::xml_encode(const std::string& str)
00236 {
00237     std::string result;
00238     result.reserve(str.size() + 20);    // Assume ~5 characters converted (length increases)
00239     const unsigned char *ptr = reinterpret_cast<const unsigned char*>(str.c_str());
00240     const unsigned char *end_ptr = ptr + str.size();
00241     while (ptr < end_ptr) {
00242         // check byte ranges for valid UTF-8
00243         // see http://en.wikipedia.org/wiki/UTF-8
00244         // also, see http://www.w3.org/TR/REC-xml/#charsets
00245         // this implementation is the strictest subset of both
00246         if ((*ptr >= 0x20 && *ptr <= 0x7F) || *ptr == 0x9 || *ptr == 0xa || *ptr == 0xd) {
00247             // regular ASCII character
00248             switch(*ptr) {
00249                     // Escape special XML characters.
00250                 case '&':
00251                     result += "&amp;";
00252                     break;
00253                 case '<':
00254                     result += "&lt;";
00255                     break;
00256                 case '>':
00257                     result += "&gt;";
00258                     break;
00259                 case '\"':
00260                     result += "&quot;";
00261                     break;
00262                 case '\'':
00263                     result += "&apos;";
00264                     break;
00265                 default:
00266                     result += *ptr;
00267             }
00268         } else if (*ptr >= 0xC2 && *ptr <= 0xDF) {
00269             // two-byte sequence
00270             if (*(ptr+1) >= 0x80 && *(ptr+1) <= 0xBF) {
00271                 result += *ptr;
00272                 result += *(++ptr);
00273             } else {
00274                 // insert replacement char
00275                 result += 0xef;
00276                 result += 0xbf;
00277                 result += 0xbd;
00278             }
00279         } else if (*ptr >= 0xE0 && *ptr <= 0xEF) {
00280             // three-byte sequence
00281             if (*(ptr+1) >= 0x80 && *(ptr+1) <= 0xBF
00282                 && *(ptr+2) >= 0x80 && *(ptr+2) <= 0xBF) {
00283                 result += *ptr;
00284                 result += *(++ptr);
00285                 result += *(++ptr);
00286             } else {
00287                 // insert replacement char
00288                 result += 0xef;
00289                 result += 0xbf;
00290                 result += 0xbd;
00291             }
00292         } else if (*ptr >= 0xF0 && *ptr <= 0xF4) {
00293             // four-byte sequence
00294             if (*(ptr+1) >= 0x80 && *(ptr+1) <= 0xBF
00295                 && *(ptr+2) >= 0x80 && *(ptr+2) <= 0xBF
00296                 && *(ptr+3) >= 0x80 && *(ptr+3) <= 0xBF) {
00297                 result += *ptr;
00298                 result += *(++ptr);
00299                 result += *(++ptr);
00300                 result += *(++ptr);
00301             } else {
00302                 // insert replacement char
00303                 result += 0xef;
00304                 result += 0xbf;
00305                 result += 0xbd;
00306             }
00307         } else {
00308             // insert replacement char
00309             result += 0xef;
00310             result += 0xbf;
00311             result += 0xbd;
00312         }
00313         ++ptr;
00314     }
00315     
00316     return result;
00317 }
00318 
00319 void algorithm::float_from_bytes(long double& value, const unsigned char *ptr, size_t num_exp_bits, size_t num_fraction_bits)
00320 {
00321     // get sign of the number from the first bit
00322     const int value_sign = (*ptr & 0x80) ? -1 : 1;
00323     
00324     // build exponent value from bitstream
00325     unsigned char mask = 0x80;
00326     boost::int16_t exponent = 0;
00327     for (size_t n = 0; n < num_exp_bits; ++n) {
00328         SHIFT_BITMASK(ptr, mask);
00329         exponent *= 2;
00330         if (*ptr & mask)
00331             exponent += 1;
00332     }
00333     
00334     // build significand from bitstream
00335     long double significand = exponent ? 1.0 : 0.0;
00336     long double significand_value = 1.0;
00337     while (num_fraction_bits) {
00338         SHIFT_BITMASK(ptr, mask);
00339         significand_value /= 2;
00340         if (*ptr & mask)
00341             significand += significand_value;
00342         --num_fraction_bits;
00343     }
00344     
00345     // calculate final value
00346     exponent -= (::pow((long double)2, (int)(num_exp_bits - 1)) - 1);
00347     value = value_sign * significand * ::pow((long double)2, exponent);
00348 }
00349 
00350 void algorithm::float_to_bytes(long double value, unsigned char *buf, size_t num_exp_bits, size_t num_fraction_bits)
00351 {
00352     // first initialize output buffer to zeros
00353     unsigned char *ptr = buf;
00354     memset(ptr, 0x00, ::ceil(static_cast<float>(num_exp_bits + num_fraction_bits + 1) / 8));
00355     
00356     // initialize first byte starting with sign of number
00357     if (value < 0) {
00358         *ptr = 0x80;
00359         value *= -1;
00360     }
00361     
00362     // break down numbers >= 1.0 by incrementing the exponent & dividing by 2
00363     boost::int16_t exponent = 0;
00364     while (value >= 1) {
00365         value /= 2;
00366         ++exponent;
00367     }
00368 
00369     // skip past exponent bits because we don't know the value yet
00370     unsigned char mask = 0x40;
00371     for (size_t n = num_exp_bits; n > 0; --n) {
00372         if (n >= 8) {
00373             ++ptr;
00374             n -= 7;
00375         } else {
00376             SHIFT_BITMASK(ptr, mask);
00377         }
00378     }
00379     
00380     // serialize fractional value < 1.0
00381     bool got_exponent = false;
00382     boost::uint16_t num_bits = 0;
00383     while (value && num_bits < num_fraction_bits) {
00384         value *= 2;
00385         if (got_exponent) {
00386             if (value >= 1.0) {
00387                 *ptr |= mask;
00388                 value -= 1.0;
00389             }
00390             SHIFT_BITMASK(ptr, mask);
00391             ++num_bits;
00392         } else {
00393             --exponent;
00394             if (value >= 1.0) {
00395                 value -= 1.0;
00396                 got_exponent = true;
00397             }
00398         }
00399     }
00400     
00401     // normalize exponent.
00402     // note: we should have a zero exponent if value == 0
00403     boost::int32_t high_bit = ::pow((long double)2, (int)(num_exp_bits - 1));
00404     if (got_exponent)
00405         exponent += (high_bit - 1);
00406     else
00407         exponent = 0;
00408     
00409     // serialize exponent bits
00410     ptr = buf;
00411     mask = 0x80;
00412     for (size_t n = 0; n < num_exp_bits; ++n) {
00413         SHIFT_BITMASK(ptr, mask);
00414         if (exponent >= high_bit) {
00415             *ptr |= mask;
00416             exponent -= high_bit;
00417         }
00418         high_bit /= 2;
00419     }
00420 }
00421     
00422 }   // end namespace pion