libdap  Updated for version 3.17.0
escaping.cc
00001 
00002 // -*- mode: c++; c-basic-offset:4 -*-
00003 
00004 // This file is part of libdap, A C++ implementation of the OPeNDAP Data
00005 // Access Protocol.
00006 
00007 // Copyright (c) 2002,2003 OPeNDAP, Inc.
00008 // Author: James Gallagher <jgallagher@opendap.org>
00009 //
00010 // This library is free software; you can redistribute it and/or
00011 // modify it under the terms of the GNU Lesser General Public
00012 // License as published by the Free Software Foundation; either
00013 // version 2.1 of the License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful,
00016 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00023 //
00024 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
00025 
00026 // Copyright (c) 1996, California Institute of Technology.
00027 // ALL RIGHTS RESERVED.   U.S. Government Sponsorship acknowledged.
00028 //
00029 // Please read the full copyright notice in the file COPYRIGHT_URI
00030 // in this directory.
00031 //
00032 // Author: Todd Karakashian, NASA/Jet Propulsion Laboratory
00033 //         Todd.K.Karakashian@jpl.nasa.gov
00034 //
00035 // $RCSfile: escaping.cc,v $ - Miscellaneous routines for OPeNDAP HDF server
00036 //
00037 // These two routines are for escaping/unescaping strings that are identifiers
00038 // in DAP2
00039 // id2www() -- escape (using WWW hex codes) non-allowable characters in a
00040 // DAP2 identifier
00041 // www2id() -- given a WWW hexcode escaped identifier, restore it
00042 //
00043 // These two routines are for escaping/unescaping strings storing attribute
00044 // values.  They use traditional octal escapes (\nnn) because they are
00045 // intended to be viewed by a user
00046 // escattr() -- escape (using traditional octal backslash) non-allowable
00047 // characters in the value of a DAP2 attribute
00048 // unescattr() -- given an octally escaped string, restore it
00049 //
00050 // These are routines used by the above, not intended to be called directly:
00051 //
00052 // hexstring()
00053 // unhexstring()
00054 // octstring()
00055 // unoctstring()
00056 //
00057 // -Todd
00058 
00059 #include "config.h"
00060 
00061 #include <ctype.h>
00062 
00063 #include <iomanip>
00064 #include <string>
00065 #include <sstream>
00066 
00067 #include "GNURegex.h"
00068 #include "Error.h"
00069 #include "InternalErr.h"
00070 //#define DODS_DEBUG
00071 #include "debug.h"
00072 
00073 using namespace std;
00074 
00075 namespace libdap {
00076 
00077 // The next four functions were originally defined static, but I removed that
00078 // to make testing them (see generalUtilTest.cc) easier to write. 5/7/2001
00079 // jhrg
00080 
/** Format a byte as two lowercase hexadecimal digits.
 *
 * @param val The byte to format.
 * @return The value written in base 16, left-padded with '0' to a width of
 * two characters (for example, 10 becomes "0a").
 */
string
hexstring(unsigned char val)
{
    ostringstream out;
    out.setf(ios::hex, ios::basefield);
    out.fill('0');
    out.width(2);
    // Widen first so the stream prints a number rather than a raw character.
    out << static_cast<unsigned int>(val);
    return out.str();
}
00089 
/** Turn a string of hexadecimal digits into the single character it encodes.
 *
 * @param s Text holding a hexadecimal number (e.g. "41").
 * @return A one-character string holding the parsed value narrowed to a
 * char. When the parsed value is zero the result is the empty string.
 */
string
unhexstring(string s)
{
    int code = 0;
    istringstream parser(s);
    parser >> hex >> code;

    const char decoded = static_cast<char>(code);
    // A decoded NUL produces an empty string, exactly as copying from a
    // NUL-terminated buffer would.
    return (decoded == '\0') ? string() : string(1, decoded);
}
00101 
/** Format a byte as three octal digits.
 *
 * @param val The byte to format.
 * @return The value written in base 8, left-padded with '0' to a width of
 * three characters (for example, 10 becomes "012").
 */
string
octstring(unsigned char val)
{
    ostringstream out;
    out.setf(ios::oct, ios::basefield);
    out.fill('0');
    out.width(3);
    // Widen first so the stream prints a number rather than a raw character.
    out << static_cast<unsigned int>(val);
    return out.str();
}
00111 
/** Turn a string of octal digits into the single character it encodes.
 *
 * @param s Text holding an octal number (e.g. "101").
 * @return A one-character string holding the parsed value narrowed to a
 * char. When the parsed value is zero the result is the empty string.
 */
string
unoctstring(string s)
{
    int code = 0;
    istringstream parser(s);
    parser >> oct >> code;

    DBG(cerr << "unoctstring: " << code << endl);

    const char decoded = static_cast<char>(code);
    // A decoded NUL produces an empty string, exactly as copying from a
    // NUL-terminated buffer would.
    return (decoded == '\0') ? string() : string(1, decoded);
}
00127 
00152 string
00153 id2www(string in, const string &allowable)
00154 {
00155     string::size_type i = 0;
00156     DBG(cerr<<"Input string: [" << in << "]" << endl);
00157     while ((i = in.find_first_not_of(allowable, i)) != string::npos) {
00158         DBG(cerr<<"Found escapee: [" << in[i] << "]");
00159         in.replace(i, 1, "%" + hexstring(in[i]));
00160         DBGN(cerr<<" now the string is: " << in << endl);
00161         i += 3;//i++;
00162     }
00163 
00164     return in;
00165 }
00166 
00177 string
00178 id2www_ce(string in, const string &allowable)
00179 {
00180     return id2www(in, allowable);
00181 
00182 
00183 }
00184 
00219 string
00220 www2id(const string &in, const string &escape, const string &except)
00221 {
00222     string::size_type i = 0;
00223     string res = in;
00224     while ((i = res.find_first_of(escape, i)) != string::npos) {
00225         if (except.find(res.substr(i, 3)) != string::npos) {
00226             i += 3;
00227             continue;
00228         }
00229         res.replace(i, 3, unhexstring(res.substr(i + 1, 2)));
00230         ++i;
00231     }
00232 
00233     return res;
00234 }
00235 
00236 static string
00237 entity(char c)
00238 {
00239     switch (c) {
00240     case '>': return "&gt;";
00241     case '<': return "&lt;";
00242     case '&': return "&amp;";
00243     case '\'': return "&apos;";
00244     case '\"': return "&quot;";
00245     default:
00246         throw InternalErr(__FILE__, __LINE__, "Unrecognized character.");
00247     }
00248 }
00249 
/** Convert a number written in octal to the same number written in hex.
 *
 * The input is parsed as one octal integer and printed in lowercase
 * hexadecimal, left-padded with '0' to a minimum width of two characters.
 * Values above octal 377 produce more than two hex digits.
 *
 * @param octal_digits Text holding an octal number (e.g. "12").
 * @return The value in hexadecimal (e.g. "0a").
 */
string
octal_to_hex(const string &octal_digits)
{
    int value = 0;
    istringstream parser(octal_digits);
    parser >> oct >> value;

    ostringstream out;
    out << setfill('0') << setw(2) << hex << value;
    return out.str();
}
00264 
00271 string
00272 id2xml(string in, const string &not_allowed)
00273 {
00274     string::size_type i = 0;
00275 
00276     while ((i = in.find_first_of(not_allowed, i)) != string::npos) {
00277         in.replace(i, 1, entity(in[i]));
00278         ++i;
00279     }
00280 #if 0
00281     // Removed the encoding of octal escapes. This function is used by
00282     // AttrTable to encode the stuff that is the value of the <value>
00283     // element in the DDX. The problem is that some of the values are not
00284     // valid UTF-8 and that makes a XML parser gag.; ticket 1512.
00285     // jhrg 3/19/10
00286 
00287     // OK, now scan for octal escape sequences like \\012 (where the '\'
00288     // is itself escaped). This type of attribute value comes from the netCDF
00289     // handler and maybe others. Assumption: The '\' will always appear as
00290     // in its escaped form: '\\'. NB: Both backslashes must be escaped in the
00291     // C++ string.
00292     string octal_escape = "\\\\";
00293     i = 0;
00294     string::size_type length = in.length();
00295     while ((i = in.find(octal_escape, i)) != string::npos) {
00296         // Get the three octal digits following the '\\0'
00297         string::size_type j = i + 2;
00298         if (j + 1 >= length)  // Check that we're not past the end
00299             break;
00300         string octal_digits = in.substr(j, 3);
00301         // convert to a &#xdd; XML escape
00302         string hex_escape = string("&#x");
00303         hex_escape.append(octal_to_hex(octal_digits));
00304         hex_escape.append(string(";"));
00305 
00306         // replace the octal escape with an XML/hex escape
00307         in.replace(i, 5, hex_escape);
00308 
00309         // increment i
00310         i += 6;
00311     }
00312 #endif
00313     return in;
00314 }
00315 
/** Decode the five predefined XML entities in a string.
 *
 * "&gt;", "&lt;", "&amp;", "&apos;" and "&quot;" are replaced by the
 * characters they stand for. The string is decoded in a single left-to-right
 * pass and decoded characters are never re-examined, so text is decoded
 * exactly once: "&amp;quot;" becomes "&quot;" and "&amp;amp;" becomes
 * "&amp;". Any other use of '&' is copied through unchanged.
 *
 * @param in The string to decode (taken by value and edited in place).
 * @return The decoded string.
 */
string
xml2id(string in)
{
    // One row per entity: its text, the length of that text, and the
    // character it decodes to.
    struct XmlEntity {
        const char *text;
        string::size_type length;
        char value;
    };
    static const XmlEntity entities[] = {
        { "&gt;", 4, '>' },
        { "&lt;", 4, '<' },
        { "&amp;", 5, '&' },
        { "&apos;", 6, '\'' },
        { "&quot;", 6, '"' }
    };
    static const unsigned int entity_count = sizeof(entities) / sizeof(entities[0]);

    string::size_type i = 0;
    while ((i = in.find('&', i)) != string::npos) {
        for (unsigned int e = 0; e < entity_count; ++e) {
            // compare() looks only at the characters actually available, so
            // an entity cut short by the end of the string does not match.
            if (in.compare(i, entities[e].length, entities[e].text) == 0) {
                in.replace(i, entities[e].length, 1, entities[e].value);
                break;
            }
        }
        // Step over either the decoded character or a '&' that starts no
        // known entity; this is what prevents double decoding.
        ++i;
    }

    return in;
}
00347 
/** Replace every '%' escape sequence in a string with an underscore.
 *
 * Each '%' together with the (up to) two characters that follow it is
 * replaced by a single '_'.
 *
 * @param s The string to rewrite (taken by value and edited in place).
 * @return The rewritten string.
 */
string
esc2underscore(string s)
{
    // After a replacement the text up to and including the new '_' holds no
    // '%', so the search can resume just past it.
    for (string::size_type at = s.find('%');
         at != string::npos;
         at = s.find('%', at + 1)) {
        s.replace(at, 3, "_");
    }

    return s;
}
00362 
00363 
00367 string
00368 escattr(string s)
00369 {
00370     const string printable = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789~`!@#$%^&*()_-+={[}]|\\:;<,>.?/'\"";
00371     const string ESC = "\\";
00372     const string DOUBLE_ESC = ESC + ESC;
00373     const string QUOTE = "\"";
00374     const string ESCQUOTE = ESC + QUOTE;
00375 
00376     // escape \ with a second backslash
00377     string::size_type ind = 0;
00378     while ((ind = s.find(ESC, ind)) != s.npos) {
00379         s.replace(ind, 1, DOUBLE_ESC);
00380         ind += DOUBLE_ESC.length();
00381     }
00382 
00383     // escape non-printing characters with octal escape
00384     ind = 0;
00385     while ((ind = s.find_first_not_of(printable, ind)) != s.npos)
00386         s.replace(ind, 1, ESC + octstring(s[ind]));
00387 
00388     // escape " with backslash
00389     ind = 0;
00390     while ((ind = s.find(QUOTE, ind)) != s.npos) {
00391         s.replace(ind, 1, ESCQUOTE);
00392         ind += ESCQUOTE.length();
00393     }
00394 
00395     return s;
00396 }
00397 
00406 string
00407 unescattr(string s)
00408 {
00409     Regex octal("\\\\[0-3][0-7][0-7]");  // matches 4 characters
00410     Regex esc_quote("\\\\\"");  // matches 3 characters
00411     Regex esc_esc("\\\\\\\\");      // matches 2 characters
00412     const string ESC = "\\";
00413     const string QUOTE = "\"";
00414     int matchlen;
00415     unsigned int index;
00416 
00417     DBG(cerr << "0XX" << s << "XXX" << endl);
00418     // unescape any escaped backslashes
00419     index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
00420     while (index < s.length()) {
00421         DBG(cerr << "1aXX" << s << "XXX index: " << index << endl);
00422         s.replace(index, 2, ESC);
00423         DBG(cerr << "1bXX" << s << "XXX index: " << index << endl);
00424         index = esc_esc.search(s.c_str(), s.length(), matchlen, 0);
00425     }
00426 
00427     // unescape any escaped double quote characters
00428     index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
00429     while (index < s.length()) {
00430         s.replace(index, 2, QUOTE);
00431         DBG(cerr << "2XX" << s << "XXX index: " << index << endl);
00432         index = esc_quote.search(s.c_str(), s.length(), matchlen, 0);
00433     }
00434 
00435     // unescape octal characters
00436     index = octal.search(s.c_str(), s.length(), matchlen, 0);
00437     while (index < s.length()) {
00438         s.replace(index, 4, unoctstring(s.substr(index + 1, 3)));
00439         DBG(cerr << "3XX" << s << "XXX index: " << index << endl);
00440         index = octal.search(s.c_str(), s.length(), matchlen, 0);
00441     }
00442 
00443     DBG(cerr << "4XX" << s << "XXX" << endl);
00444     return s;
00445 }
00446 
/** Wrap an error message in double quotes and escape the quotes inside it.
 *
 * A leading and a trailing double quote are added when missing, then every
 * interior double quote not already preceded by a backslash gets one.
 * An empty message becomes a pair of quotes. A message that is a single
 * double quote is treated as having only its opening quote.
 *
 * @param msg The message text (taken by value and edited in place).
 * @return The quoted and escaped message.
 */
string
munge_error_message(string msg)
{
    // Add the opening quote. The empty() test keeps an empty message from
    // being indexed.
    if (msg.empty() || msg[0] != '"')
        msg.insert(msg.begin(), '"');

    // Add the closing quote. A one-character message holds only the opening
    // quote at this point, so it always needs one.
    if (msg.length() < 2 || msg[msg.length() - 1] != '"')
        msg += "\"";

    // Escape interior quotes; the first and last characters are the
    // enclosing quotes and are left alone. The length is re-read on every
    // iteration because insertions grow the string.
    for (string::size_type i = 1; i + 1 < msg.length(); ++i) {
        if (msg[i] == '"' && msg[i - 1] != '\\') {
            msg.insert(i, 1, '\\');
            ++i;    // step onto the quote, which moved one place right
        }
    }

    return msg;
}
00464 
/** Put a backslash in front of every double quote in a string.
 *
 * Quotes that already follow a backslash are escaped again; no attempt is
 * made to detect existing escapes.
 *
 * @param source The text to escape (taken by value and edited in place).
 * @return The text with each double quote replaced by a backslash and a
 * double quote.
 */
string
escape_double_quotes(string source)
{
    const string escaped_quote = "\\\""; // a backslash and a double quote

    // Resume after the two characters just written so the new quote is not
    // matched again.
    for (string::size_type at = source.find('\"', 0);
         at != string::npos;
         at = source.find('\"', at + escaped_quote.length())) {
        source.replace(at, 1, escaped_quote);
    }

    return source;
}
00480 
/** Replace every backslash-plus-double-quote pair with a double quote.
 *
 * @param source The text to unescape (taken by value and edited in place).
 * @return The text with each backslash that directly precedes a double
 * quote removed.
 */
string
unescape_double_quotes(string source)
{
    const string escaped_quote = "\\\""; // a backslash and a double quote

    string::size_type at = source.find(escaped_quote, 0);
    while (at != string::npos) {
        source.replace(at, escaped_quote.length(), 1, '\"');
        // Continue just past the quote that was written.
        at = source.find(escaped_quote, at + 1);
    }

    return source;
}
00497 
00498 } // namespace libdap
00499