libdap
Updated for version 3.17.0
|
00001 00002 // -*- mode: c++; c-basic-offset:4 -*- 00003 00004 // This file is part of libdap, A C++ implementation of the OPeNDAP Data 00005 // Access Protocol. 00006 00007 // Copyright (c) 2002,2003 OPeNDAP, Inc. 00008 // Author: James Gallagher <jgallagher@opendap.org> 00009 // 00010 // This library is free software; you can redistribute it and/or 00011 // modify it under the terms of the GNU Lesser General Public 00012 // License as published by the Free Software Foundation; either 00013 // version 2.1 of the License, or (at your option) any later version. 00014 // 00015 // This library is distributed in the hope that it will be useful, 00016 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 // Lesser General Public License for more details. 00019 // 00020 // You should have received a copy of the GNU Lesser General Public 00021 // License along with this library; if not, write to the Free Software 00022 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00023 // 00024 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112. 00025 00026 // Copyright (c) 1996, California Institute of Technology. 00027 // ALL RIGHTS RESERVED. U.S. Government Sponsorship acknowledged. 00028 // 00029 // Please read the full copyright notice in the file COPYRIGHT_URI 00030 // in this directory. 00031 // 00032 // Author: Todd Karakashian, NASA/Jet Propulsion Laboratory 00033 // Todd.K.Karakashian@jpl.nasa.gov 00034 // 00035 // $RCSfile: escaping.cc,v $ - Miscellaneous routines for OPeNDAP HDF server 00036 // 00037 // These two routines are for escaping/unescaping strings that are identifiers 00038 // in DAP2 00039 // id2www() -- escape (using WWW hex codes) non-allowable characters in a 00040 // DAP2 identifier 00041 // www2id() -- given an WWW hexcode escaped identifier, restore it 00042 // 00043 // These two routines are for escaping/unescaping strings storing attribute 00044 // values. They use traditional octal escapes (\nnn) because they are 00045 // intended to be viewed by a user 00046 // escattr() -- escape (using traditional octal backslash) non-allowable 00047 // characters in the value of a DAP2 attribute 00048 // unescattr() -- given an octally escaped string, restore it 00049 // 00050 // These are routines used by the above, not intended to be called directly: 00051 // 00052 // hexstring() 00053 // unhexstring() 00054 // octstring() 00055 // unoctstring() 00056 // 00057 // -Todd 00058 00059 #include "config.h" 00060 00061 #include <ctype.h> 00062 00063 #include <iomanip> 00064 #include <string> 00065 #include <sstream> 00066 00067 #include "GNURegex.h" 00068 #include "Error.h" 00069 #include "InternalErr.h" 00070 //#define DODS_DEBUG 00071 #include "debug.h" 00072 00073 using namespace std; 00074 00075 namespace libdap { 00076 00077 // The next four functions were originally defined static, but I removed that 00078 // to make testing them (see generalUtilTest.cc) easier to write. 5/7/2001 00079 // jhrg 00080 00081 string 00082 hexstring(unsigned char val) 00083 { 00084 ostringstream buf; 00085 buf << hex << setw(2) << setfill('0') << static_cast<unsigned int>(val); 00086 00087 return buf.str(); 00088 } 00089 00090 string 00091 unhexstring(string s) 00092 { 00093 int val; 00094 istringstream ss(s); 00095 ss >> hex >> val; 00096 char tmp_str[2]; 00097 tmp_str[0] = static_cast<char>(val); 00098 tmp_str[1] = '\0'; 00099 return string(tmp_str); 00100 } 00101 00102 string 00103 octstring(unsigned char val) 00104 { 00105 ostringstream buf; 00106 buf << oct << setw(3) << setfill('0') 00107 << static_cast<unsigned int>(val); 00108 00109 return buf.str(); 00110 } 00111 00112 string 00113 unoctstring(string s) 00114 { 00115 int val; 00116 00117 istringstream ss(s); 00118 ss >> oct >> val; 00119 00120 DBG(cerr << "unoctstring: " << val << endl); 00121 00122 char tmp_str[2]; 00123 tmp_str[0] = static_cast<char>(val); 00124 tmp_str[1] = '\0'; 00125 return string(tmp_str); 00126 } 00127 00152 string 00153 id2www(string in, const string &allowable) 00154 { 00155 string::size_type i = 0; 00156 DBG(cerr<<"Input string: [" << in << "]" << endl); 00157 while ((i = in.find_first_not_of(allowable, i)) != string::npos) { 00158 DBG(cerr<<"Found escapee: [" << in[i] << "]"); 00159 in.replace(i, 1, "%" + hexstring(in[i])); 00160 DBGN(cerr<<" now the string is: " << in << endl); 00161 i += 3;//i++; 00162 } 00163 00164 return in; 00165 } 00166 00177 string 00178 id2www_ce(string in, const string &allowable) 00179 { 00180 return id2www(in, allowable); 00181 00182 00183 } 00184 00219 string 00220 www2id(const string &in, const string &escape, const string &except) 00221 { 00222 string::size_type i = 0; 00223 string res = in; 00224 while ((i = res.find_first_of(escape, i)) != string::npos) { 00225 if (except.find(res.substr(i, 3)) != string::npos) { 00226 i += 3; 00227 continue; 00228 } 00229 res.replace(i, 3, unhexstring(res.substr(i + 1, 2))); 00230 ++i; 00231 } 00232 00233 return res; 00234 } 00235 00236 static string 00237 entity(char c) 00238 { 00239 switch (c) { 00240 case '>': return ">"; 00241 case '<': return "<"; 00242 case '&': return "&"; 00243 case '\'': return "'"; 00244 case '\"': return """; 00245 default: 00246 throw InternalErr(__FILE__, __LINE__, "Unrecognized character."); 00247 } 00248 } 00249 00250 // Assumption: There are always exactly two octal digits in the input 00251 // and two hex digits in the result. 00252 string 00253 octal_to_hex(const string &octal_digits) 00254 { 00255 int val; 00256 00257 istringstream ss(octal_digits); 00258 ss >> oct >> val; 00259 00260 ostringstream ds; 00261 ds << hex << setw(2) << setfill('0') << val; 00262 return ds.str(); 00263 } 00264 00271 string 00272 id2xml(string in, const string ¬_allowed) 00273 { 00274 string::size_type i = 0; 00275 00276 while ((i = in.find_first_of(not_allowed, i)) != string::npos) { 00277 in.replace(i, 1, entity(in[i])); 00278 ++i; 00279 } 00280 #if 0 00281 // Removed the encoding of octal escapes. This function is used by 00282 // AttrTable to encode the stuff that is the value of the <value> 00283 // element in the DDX. The problem is that some of the values are not 00284 // valid UTF-8 and that makes a XML parser gag.; ticket 1512. 00285 // jhrg 3/19/10 00286 00287 // OK, now scan for octal escape sequences like \\012 (where the '\' 00288 // is itself escaped). This type of attribute value comes from the netCDF 00289 // handler and maybe others. Assumption: The '\' will always appear as 00290 // in its escaped form: '\\'. NB: Both backslashes must be escaped in the 00291 // C++ string. 00292 string octal_escape = "\\\\"; 00293 i = 0; 00294 string::size_type length = in.length(); 00295 while ((i = in.find(octal_escape, i)) != string::npos) { 00296 // Get the three octal digits following the '\\0' 00297 string::size_type j = i + 2; 00298 if (j + 1 >= length) // Check that we're not past the end 00299 break; 00300 string octal_digits = in.substr(j, 3); 00301 // convert to a Ý XML escape 00302 string hex_escape = string("&#x"); 00303 hex_escape.append(octal_to_hex(octal_digits)); 00304 hex_escape.append(string(";")); 00305 00306 // replace the octal escape with an XML/hex escape 00307 in.replace(i, 5, hex_escape); 00308 00309 // increment i 00310 i += 6; 00311 } 00312 #endif 00313 return in; 00314 } 00315 00321 string 00322 xml2id(string in) 00323 { 00324 string::size_type i = 0; 00325 00326 while ((i = in.find(">", i)) != string::npos) 00327 in.replace(i, 4, ">"); 00328 00329 i = 0; 00330 while ((i = in.find("<", i)) != string::npos) 00331 in.replace(i, 4, "<"); 00332 00333 i = 0; 00334 while ((i = in.find("&", i)) != string::npos) 00335 in.replace(i, 5, "&"); 00336 00337 i = 0; 00338 while ((i = in.find("'", i)) != string::npos) 00339 in.replace(i, 6, "'"); 00340 00341 i = 0; 00342 while ((i = in.find(""", i)) != string::npos) 00343 in.replace(i, 6, "\""); 00344 00345 return in; 00346 } 00347 00353 string 00354 esc2underscore(string s) 00355 { 00356 string::size_type pos; 00357 while ((pos = s.find('%')) != string::npos) 00358 s.replace(pos, 3, "_"); 00359 00360 return s; 00361 } 00362 00363 00367 string 00368 escattr(string s) 00369 { 00370 const string printable = " ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789~`!@#$%^&*()_-+={[}]|\\:;<,>.?/'\""; 00371 const string ESC = "\\"; 00372 const string DOUBLE_ESC = ESC + ESC; 00373 const string QUOTE = "\""; 00374 const string ESCQUOTE = ESC + QUOTE; 00375 00376 // escape \ with a second backslash 00377 string::size_type ind = 0; 00378 while ((ind = s.find(ESC, ind)) != s.npos) { 00379 s.replace(ind, 1, DOUBLE_ESC); 00380 ind += DOUBLE_ESC.length(); 00381 } 00382 00383 // escape non-printing characters with octal escape 00384 ind = 0; 00385 while ((ind = s.find_first_not_of(printable, ind)) != s.npos) 00386 s.replace(ind, 1, ESC + octstring(s[ind])); 00387 00388 // escape " with backslash 00389 ind = 0; 00390 while ((ind = s.find(QUOTE, ind)) != s.npos) { 00391 s.replace(ind, 1, ESCQUOTE); 00392 ind += ESCQUOTE.length(); 00393 } 00394 00395 return s; 00396 } 00397 00406 string 00407 unescattr(string s) 00408 { 00409 Regex octal("\\\\[0-3][0-7][0-7]"); // matches 4 characters 00410 Regex esc_quote("\\\\\""); // matches 3 characters 00411 Regex esc_esc("\\\\\\\\"); // matches 2 characters 00412 const string ESC = "\\"; 00413 const string QUOTE = "\""; 00414 int matchlen; 00415 unsigned int index; 00416 00417 DBG(cerr << "0XX" << s << "XXX" << endl); 00418 // unescape any escaped backslashes 00419 index = esc_esc.search(s.c_str(), s.length(), matchlen, 0); 00420 while (index < s.length()) { 00421 DBG(cerr << "1aXX" << s << "XXX index: " << index << endl); 00422 s.replace(index, 2, ESC); 00423 DBG(cerr << "1bXX" << s << "XXX index: " << index << endl); 00424 index = esc_esc.search(s.c_str(), s.length(), matchlen, 0); 00425 } 00426 00427 // unescape any escaped double quote characters 00428 index = esc_quote.search(s.c_str(), s.length(), matchlen, 0); 00429 while (index < s.length()) { 00430 s.replace(index, 2, QUOTE); 00431 DBG(cerr << "2XX" << s << "XXX index: " << index << endl); 00432 index = esc_quote.search(s.c_str(), s.length(), matchlen, 0); 00433 } 00434 00435 // unescape octal characters 00436 index = octal.search(s.c_str(), s.length(), matchlen, 0); 00437 while (index < s.length()) { 00438 s.replace(index, 4, unoctstring(s.substr(index + 1, 3))); 00439 DBG(cerr << "3XX" << s << "XXX index: " << index << endl); 00440 index = octal.search(s.c_str(), s.length(), matchlen, 0); 00441 } 00442 00443 DBG(cerr << "4XX" << s << "XXX" << endl); 00444 return s; 00445 } 00446 00447 string 00448 munge_error_message(string msg) 00449 { 00450 // First, add enclosing quotes if needed. 00451 if (*msg.begin() != '"') 00452 msg.insert(msg.begin(), '"'); 00453 if (*(msg.end() - 1) != '"') 00454 msg += "\""; 00455 00456 // Now escape any internal double quotes that aren't escaped. 00457 string::iterator miter; 00458 for (miter = msg.begin() + 1; miter != msg.end() - 1; miter++) 00459 if (*miter == '"' && *(miter - 1) != '\\') 00460 miter = msg.insert(miter, '\\'); 00461 00462 return msg; 00463 } 00464 00469 string 00470 escape_double_quotes(string source) 00471 { 00472 string::size_type idx = 0; 00473 while((idx = source.find('\"', idx)) != string::npos) { 00474 source.replace(idx, 1, "\\\""); // a backslash and a double quote 00475 idx += 2; 00476 } 00477 00478 return source; 00479 } 00480 00486 string 00487 unescape_double_quotes(string source) 00488 { 00489 string::size_type idx = 0; 00490 while((idx = source.find("\\\"", idx)) != string::npos) { 00491 source.replace(idx, 2, "\""); // a backslash and a double quote 00492 ++idx; 00493 } 00494 00495 return source; 00496 } 00497 00498 } // namespace libdap 00499