libdap  Updated for version 3.17.0
HTTPConnect.cc
00001 
00002 // -*- mode: c++; c-basic-offset:4 -*-
00003 
00004 // This file is part of libdap, A C++ implementation of the OPeNDAP Data
00005 // Access Protocol.
00006 
00007 // Copyright (c) 2002,2003 OPeNDAP, Inc.
00008 // Author: James Gallagher <jgallagher@opendap.org>
00009 //
00010 // This library is free software; you can redistribute it and/or
00011 // modify it under the terms of the GNU Lesser General Public
00012 // License as published by the Free Software Foundation; either
00013 // version 2.1 of the License, or (at your option) any later version.
00014 //
00015 // This library is distributed in the hope that it will be useful,
00016 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00017 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00018 // Lesser General Public License for more details.
00019 //
00020 // You should have received a copy of the GNU Lesser General Public
00021 // License along with this library; if not, write to the Free Software
00022 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
00023 //
00024 // You can contact OPeNDAP, Inc. at PO Box 112, Saunderstown, RI. 02874-0112.
00025 
00026 
00027 #include "config.h"
00028 
00029 #ifdef HAVE_UNISTD_H
00030 #include <unistd.h>
00031 #endif
00032 
00033 #include <sys/stat.h>
00034 
00035 #ifdef WIN32
00036 #include <io.h>
00037 #endif
00038 
00039 #include <string>
00040 #include <vector>
00041 #include <functional>
00042 #include <algorithm>
00043 #include <sstream>
00044 #include <fstream>
00045 #include <iterator>
00046 #include <cstdlib>
00047 #include <cstring>
00048 #include <cerrno>
00049 
00050 //#define DODS_DEBUG2
00051 //#define HTTP_TRACE
00052 //#define DODS_DEBUG
00053 
00054 #undef USE_GETENV
00055 
00056 
00057 #include "debug.h"
00058 #include "mime_util.h"
00059 #include "media_types.h"
00060 #include "GNURegex.h"
00061 #include "HTTPCache.h"
00062 #include "HTTPConnect.h"
00063 #include "RCReader.h"
00064 #include "HTTPResponse.h"
00065 #include "HTTPCacheResponse.h"
00066 
00067 using namespace std;
00068 
00069 namespace libdap {
00070 
00071 // These global variables are not MT-Safe, but I'm leaving them as is because
00072 // they are used only for debugging (set them in a debugger like gdb or ddd).
00073 // They are not static because I think that many debuggers cannot access
00074 // static variables. 08/07/02 jhrg
00075 
00076 // Set this to 1 to turn on libcurl's verbose mode (for debugging).
00077 int www_trace = 0;
00078 
00079 // Keep the temporary files; useful for debugging.
00080 int dods_keep_temps = 0;
00081 
#define CLIENT_ERR_MIN 400
#define CLIENT_ERR_MAX 417
// Human readable text for the HTTP 4xx status codes, indexed by
// (status - CLIENT_ERR_MIN). Multi-line messages are written as adjacent
// string literals so that source indentation never ends up in the text.
static const char *http_client_errors[CLIENT_ERR_MAX - CLIENT_ERR_MIN + 1] =
    {
        "Bad Request:",
        "Unauthorized: Contact the server administrator.",
        "Payment Required.",
        "Forbidden: Contact the server administrator.",
        "Not Found: The data source or server could not be found.\n"
        "Often this means that the OPeNDAP server is missing or needs attention;\n"
        "Please contact the server administrator.",
        "Method Not Allowed.",
        "Not Acceptable.",
        "Proxy Authentication Required.",
        "Request Time-out.",
        "Conflict.",
        "Gone.",
        "Length Required.",
        "Precondition Failed.",
        "Request Entity Too Large.",
        "Request URI Too Large.",
        "Unsupported Media Type.",
        "Requested Range Not Satisfiable.",
        "Expectation Failed."
    };

#define SERVER_ERR_MIN 500
#define SERVER_ERR_MAX 505
// Human readable text for the HTTP 5xx status codes, indexed by
// (status - SERVER_ERR_MIN).
static const char *http_server_errors[SERVER_ERR_MAX - SERVER_ERR_MIN + 1] =
    {
        "Internal Server Error.",
        "Not Implemented.",
        "Bad Gateway.",
        "Service Unavailable.",
        "Gateway Time-out.",
        "HTTP Version Not Supported."
    };

/** Translate an HTTP status code into a message suitable for an Error.

    @param status An HTTP status code.
    @return The table entry for codes in [CLIENT_ERR_MIN, CLIENT_ERR_MAX] or
    [SERVER_ERR_MIN, SERVER_ERR_MAX]; a generic "Unknown Error" message for
    every other value. */
static std::string
http_status_to_string(int status)
{
    if (status >= CLIENT_ERR_MIN && status <= CLIENT_ERR_MAX)
        return std::string(http_client_errors[status - CLIENT_ERR_MIN]);
    else if (status >= SERVER_ERR_MIN && status <= SERVER_ERR_MAX)
        return std::string(http_server_errors[status - SERVER_ERR_MIN]);
    else
        return std::string("Unknown Error: This indicates a problem with libdap++.\nPlease report this to support@opendap.org.");
}
00132 
00133 static ObjectType
00134 determine_object_type(const string &header_value)
00135 {
00136     // DAP4 Data: application/vnd.opendap.dap4.data
00137     // DAP4 DMR: application/vnd.opendap.dap4.dataset-metadata+xml
00138 
00139     string::size_type plus = header_value.find('+');
00140     string base_type;
00141     string type_extension = "";
00142     if (plus != string::npos) {
00143         base_type= header_value.substr(0, plus);
00144         type_extension = header_value.substr(plus+1);
00145     }
00146     else
00147         base_type = header_value;
00148 
00149     if (base_type == DMR_Content_Type
00150         || (base_type.find("application/") != string::npos
00151                 && base_type.find("dap4.dataset-metadata") != string::npos)) {
00152         if (type_extension == "xml")
00153             return dap4_dmr;
00154         else
00155             return unknown_type;
00156     }
00157     else if (base_type == DAP4_DATA_Content_Type
00158                 || (base_type.find("application/") != string::npos
00159                         && base_type.find("dap4.data") != string::npos)) {
00160         return dap4_data;
00161     }
00162     else if (header_value.find("text/html") != string::npos) {
00163         return web_error;
00164     }
00165     else
00166         return unknown_type;
00167 }
00168 
00173 class ParseHeader : public unary_function<const string &, void>
00174 {
00175     ObjectType type;  // What type of object is in the stream?
00176     string server;  // Server's version string.
00177     string protocol;            // Server's protocol version.
00178     string location;            // Url returned by server
00179 
00180 public:
00181     ParseHeader() : type(unknown_type), server("dods/0.0"), protocol("2.0")
00182     { }
00183 
00184     void operator()(const string &line)
00185     {
00186         string name, value;
00187         parse_mime_header(line, name, value);
00188 
00189         DBG2(cerr << name << ": " << value << endl);
00190 
00191         // Content-Type is used to determine the content of DAP4 responses, but allow the
00192         // Content-Description header to override CT o preserve operation with DAP2 servers.
00193         // jhrg 11/12/13
00194         if (type == unknown_type && name == "content-type") {
00195             type = determine_object_type(value); // see above
00196         }
00197         if (name == "content-description" && !(type == dap4_dmr || type == dap4_data || type == dap4_error)) {
00198             type = get_description_type(value); // defined in mime_util.cc
00199         }
00200         // The second test (== "dods/0.0") tests if xopendap-server has already
00201         // been seen. If so, use that header in preference to the old
00202         // XDODS-Server header. jhrg 2/7/06
00203         else if (name == "xdods-server" && server == "dods/0.0") {
00204             server = value;
00205         }
00206         else if (name == "xopendap-server") {
00207             server = value;
00208         }
00209         else if (name == "xdap") {
00210             protocol = value;
00211         }
00212         else if (server == "dods/0.0" && name == "server") {
00213             server = value;
00214         }
00215         else if (name == "location") {
00216             location = value;
00217         }
00218     }
00219 
00220     ObjectType get_object_type()
00221     {
00222         return type;
00223     }
00224 
00225     string get_server()
00226     {
00227         return server;
00228     }
00229 
00230     string get_protocol()
00231     {
00232         return protocol;
00233     }
00234 
00235     string get_location() {
00236            return location;
00237     }
00238 };
00239 
00255 static size_t
00256 save_raw_http_headers(void *ptr, size_t size, size_t nmemb, void *resp_hdrs)
00257 {
00258     DBG2(cerr << "Inside the header parser." << endl);
00259     vector<string> *hdrs = static_cast<vector<string> * >(resp_hdrs);
00260 
00261     // Grab the header, minus the trailing newline. Or \r\n pair.
00262     string complete_line;
00263     if (nmemb > 1 && *(static_cast<char*>(ptr) + size * (nmemb - 2)) == '\r')
00264         complete_line.assign(static_cast<char *>(ptr), size * (nmemb - 2));
00265     else
00266         complete_line.assign(static_cast<char *>(ptr), size * (nmemb - 1));
00267 
00268     // Store all non-empty headers that are not HTTP status codes
00269     if (complete_line != "" && complete_line.find("HTTP") == string::npos) {
00270         DBG(cerr << "Header line: " << complete_line << endl);
00271         hdrs->push_back(complete_line);
00272     }
00273 
00274     return size * nmemb;
00275 }
00276 
00278 static int
00279 curl_debug(CURL *, curl_infotype info, char *msg, size_t size, void  *)
00280 {
00281     string message(msg, size);
00282 
00283     switch (info) {
00284     case CURLINFO_TEXT:
00285         cerr << "Text: " << message; break;
00286     case CURLINFO_HEADER_IN:
00287         cerr << "Header in: " << message; break;
00288     case CURLINFO_HEADER_OUT:
00289         cerr << "Header out: " << message; break;
00290     case CURLINFO_DATA_IN:
00291         cerr << "Data in: " << message; break;
00292     case CURLINFO_DATA_OUT:
00293         cerr << "Data out: " << message; break;
00294     case CURLINFO_END:
00295         cerr << "End: " << message; break;
00296 #ifdef CURLINFO_SSL_DATA_IN
00297     case CURLINFO_SSL_DATA_IN:
00298         cerr << "SSL Data in: " << message; break;
00299 #endif
00300 #ifdef CURLINFO_SSL_DATA_OUT
00301     case CURLINFO_SSL_DATA_OUT:
00302         cerr << "SSL Data out: " << message; break;
00303 #endif
00304     default:
00305         cerr << "Curl info: " << message; break;
00306     }
00307     return 0;
00308 }
00309 
00313 void
00314 HTTPConnect::www_lib_init()
00315 {
00316     d_curl = curl_easy_init();
00317     if (!d_curl)
00318         throw InternalErr(__FILE__, __LINE__, "Could not initialize libcurl.");
00319 
00320     // Now set options that will remain constant for the duration of this
00321     // CURL object.
00322 
00323     // Set the proxy host.
00324     if (!d_rcr->get_proxy_server_host().empty()) {
00325         DBG(cerr << "Setting up a proxy server." << endl);
00326         DBG(cerr << "Proxy host: " << d_rcr->get_proxy_server_host()
00327             << endl);
00328         DBG(cerr << "Proxy port: " << d_rcr->get_proxy_server_port()
00329             << endl);
00330         DBG(cerr << "Proxy pwd : " << d_rcr->get_proxy_server_userpw()
00331             << endl);
00332         curl_easy_setopt(d_curl, CURLOPT_PROXY,
00333                          d_rcr->get_proxy_server_host().c_str());
00334         curl_easy_setopt(d_curl, CURLOPT_PROXYPORT,
00335                          d_rcr->get_proxy_server_port());
00336 
00337         // As of 4/21/08 only NTLM, Digest and Basic work.
00338 #ifdef CURLOPT_PROXYAUTH
00339         curl_easy_setopt(d_curl, CURLOPT_PROXYAUTH, (long)CURLAUTH_ANY);
00340 #endif
00341 
00342         // Password might not be required. 06/21/04 jhrg
00343         if (!d_rcr->get_proxy_server_userpw().empty())
00344             curl_easy_setopt(d_curl, CURLOPT_PROXYUSERPWD,
00345                              d_rcr->get_proxy_server_userpw().c_str());
00346     }
00347 
00348     curl_easy_setopt(d_curl, CURLOPT_ERRORBUFFER, d_error_buffer);
00349     // We have to set FailOnError to false for any of the non-Basic
00350     // authentication schemes to work. 07/28/03 jhrg
00351     curl_easy_setopt(d_curl, CURLOPT_FAILONERROR, 0);
00352 
00353     // This means libcurl will use Basic, Digest, GSS Negotiate, or NTLM,
00354     // choosing the the 'safest' one supported by the server.
00355     // This requires curl 7.10.6 which is still in pre-release. 07/25/03 jhrg
00356     curl_easy_setopt(d_curl, CURLOPT_HTTPAUTH, (long)CURLAUTH_ANY);
00357 
00358     curl_easy_setopt(d_curl, CURLOPT_NOPROGRESS, 1);
00359     curl_easy_setopt(d_curl, CURLOPT_NOSIGNAL, 1);
00360     curl_easy_setopt(d_curl, CURLOPT_HEADERFUNCTION, save_raw_http_headers);
00361     // In read_url a call to CURLOPT_WRITEHEADER is used to set the fourth
00362     // param of save_raw_http_headers to a vector<string> object.
00363 
00364     // Follow 302 (redirect) responses
00365     curl_easy_setopt(d_curl, CURLOPT_FOLLOWLOCATION, 1);
00366     curl_easy_setopt(d_curl, CURLOPT_MAXREDIRS, 5);
00367 
00368     // If the user turns off SSL validation...
00369     if (d_rcr->get_validate_ssl() == 0) {
00370         curl_easy_setopt(d_curl, CURLOPT_SSL_VERIFYPEER, 0);
00371         curl_easy_setopt(d_curl, CURLOPT_SSL_VERIFYHOST, 0);
00372     }
00373 
00374     // Look to see if cookies are turned on in the .dodsrc file. If so,
00375     // activate here. We honor 'session cookies' (cookies without an
00376     // expiration date) here so that session-based SSO systems will work as
00377     // expected.
00378     if (!d_cookie_jar.empty()) {
00379         DBG(cerr << "Setting the cookie jar to: " << d_cookie_jar << endl);
00380         curl_easy_setopt(d_curl, CURLOPT_COOKIEJAR, d_cookie_jar.c_str());
00381         curl_easy_setopt(d_curl, CURLOPT_COOKIESESSION, 1);
00382     }
00383 
00384     if (www_trace) {
00385         cerr << "Curl version: " << curl_version() << endl;
00386         curl_easy_setopt(d_curl, CURLOPT_VERBOSE, 1);
00387         curl_easy_setopt(d_curl, CURLOPT_DEBUGFUNCTION, curl_debug);
00388     }
00389 }
00390 
00394 class BuildHeaders : public unary_function<const string &, void>
00395 {
00396     struct curl_slist *d_cl;
00397 
00398 public:
00399     BuildHeaders() : d_cl(0)
00400     {}
00401 
00402     void operator()(const string &header)
00403     {
00404         DBG(cerr << "Adding '" << header.c_str() << "' to the header list."
00405             << endl);
00406         d_cl = curl_slist_append(d_cl, header.c_str());
00407     }
00408 
00409     struct curl_slist *get_headers()
00410     {
00411         return d_cl;
00412     }
00413 };
00414 
00429 long
00430 HTTPConnect::read_url(const string &url, FILE *stream, vector<string> *resp_hdrs, const vector<string> *headers)
00431 {
00432     curl_easy_setopt(d_curl, CURLOPT_URL, url.c_str());
00433 
00434 #ifdef WIN32
00435     //  See the curl documentation for CURLOPT_FILE (aka CURLOPT_WRITEDATA)
00436     //  and the CURLOPT_WRITEFUNCTION option.  Quote: "If you are using libcurl as
00437     //  a win32 DLL, you MUST use the CURLOPT_WRITEFUNCTION option if you set the
00438     //  CURLOPT_WRITEDATA option or you will experience crashes".  At the root of
00439     //  this issue is that one should not pass a FILE * to a windows DLL.  Close
00440     //  inspection of libcurl yields that their default write function when using
00441     //  the CURLOPT_WRITEDATA is just "fwrite".
00442     curl_easy_setopt(d_curl, CURLOPT_WRITEDATA, stream);
00443     curl_easy_setopt(d_curl, CURLOPT_WRITEFUNCTION, &fwrite);
00444 #else
00445     curl_easy_setopt(d_curl, CURLOPT_WRITEDATA, stream);
00446 #endif
00447 
00448     DBG(copy(d_request_headers.begin(), d_request_headers.end(),
00449              ostream_iterator<string>(cerr, "\n")));
00450 
00451     BuildHeaders req_hdrs;
00452     req_hdrs = for_each(d_request_headers.begin(), d_request_headers.end(),
00453                         req_hdrs);
00454     if (headers)
00455         req_hdrs = for_each(headers->begin(), headers->end(), req_hdrs);
00456 
00457     curl_easy_setopt(d_curl, CURLOPT_HTTPHEADER, req_hdrs.get_headers());
00458 
00459     // Turn off the proxy for this URL?
00460     bool temporary_proxy = false;
00461     if ((temporary_proxy = url_uses_no_proxy_for(url))) {
00462         DBG(cerr << "Suppress proxy for url: " << url << endl);
00463         curl_easy_setopt(d_curl, CURLOPT_PROXY, 0);
00464     }
00465 
00466     string::size_type at_sign = url.find('@');
00467     // Assume username:password present *and* assume it's an HTTP URL; it *is*
00468     // HTTPConnect, after all. 7 is position after "http://"; the second arg
00469     // to substr() is the sub string length.
00470     if (at_sign != url.npos)
00471         d_upstring = url.substr(7, at_sign - 7);
00472 
00473     if (!d_upstring.empty())
00474         curl_easy_setopt(d_curl, CURLOPT_USERPWD, d_upstring.c_str());
00475 
00476     // Pass save_raw_http_headers() a pointer to the vector<string> where the
00477     // response headers may be stored. Callers can use the resp_hdrs
00478     // value/result parameter to get the raw response header information .
00479     curl_easy_setopt(d_curl, CURLOPT_WRITEHEADER, resp_hdrs);
00480 
00481     // This is the call that causes curl to go and get the remote resource and "write it down"
00482     // utilizing the configuration state that has been previously conditioned by various perturbations
00483     // of calls to curl_easy_setopt().
00484     CURLcode res = curl_easy_perform(d_curl);
00485 
00486     // Free the header list and null the value in d_curl.
00487     curl_slist_free_all(req_hdrs.get_headers());
00488     curl_easy_setopt(d_curl, CURLOPT_HTTPHEADER, 0);
00489 
00490     // Reset the proxy?
00491     if (temporary_proxy && !d_rcr->get_proxy_server_host().empty())
00492         curl_easy_setopt(d_curl, CURLOPT_PROXY,
00493                          d_rcr->get_proxy_server_host().c_str());
00494 
00495     if (res != 0)
00496         throw Error(d_error_buffer);
00497 
00498     long status;
00499     res = curl_easy_getinfo(d_curl, CURLINFO_HTTP_CODE, &status);
00500     if (res != 0)
00501         throw Error(d_error_buffer);
00502 
00503     char *ct_ptr = 0;
00504     res = curl_easy_getinfo(d_curl, CURLINFO_CONTENT_TYPE, &ct_ptr);
00505     if (res == CURLE_OK && ct_ptr)
00506         d_content_type = ct_ptr;
00507     else
00508         d_content_type = "";
00509 
00510     return status;
00511 }
00512 
00516 bool
00517 HTTPConnect::url_uses_proxy_for(const string &url)
00518 {
00519     if (d_rcr->is_proxy_for_used()) {
00520         Regex host_regex(d_rcr->get_proxy_for_regexp().c_str());
00521         int index = 0, matchlen;
00522         return host_regex.search(url.c_str(), url.size(), matchlen, index) != -1;
00523     }
00524 
00525     return false;
00526 }
00527 
00531 bool
00532 HTTPConnect::url_uses_no_proxy_for(const string &url) throw()
00533 {
00534     return d_rcr->is_no_proxy_for_used()
00535            && url.find(d_rcr->get_no_proxy_for_host()) != string::npos;
00536 }
00537 
00538 // Public methods. Mostly...
00539 
00546 HTTPConnect::HTTPConnect(RCReader *rcr, bool use_cpp) : d_username(""), d_password(""), d_cookie_jar(""),
00547                 d_dap_client_protocol_major(2), d_dap_client_protocol_minor(0), d_use_cpp_streams(use_cpp)
00548 
00549 {
00550     d_accept_deflate = rcr->get_deflate();
00551     d_rcr = rcr;
00552 
00553     // Load in the default headers to send with a request. The empty Pragma
00554     // headers overrides libcurl's default Pragma: no-cache header (which
00555     // will disable caching by Squid, et c.). The User-Agent header helps
00556     // make server logs more readable. 05/05/03 jhrg
00557     d_request_headers.push_back(string("Pragma:"));
00558     string user_agent = string("User-Agent: ") + string(CNAME)
00559                         + string("/") + string(CVER);
00560     d_request_headers.push_back(user_agent);
00561     if (d_accept_deflate)
00562         d_request_headers.push_back(string("Accept-Encoding: deflate, gzip, compress"));
00563 
00564     // HTTPCache::instance returns a valid ptr or 0.
00565     if (d_rcr->get_use_cache())
00566         d_http_cache = HTTPCache::instance(d_rcr->get_dods_cache_root(),true);
00567     else
00568         d_http_cache = 0;
00569 
00570     DBG2(cerr << "Cache object created (" << hex << d_http_cache << dec
00571          << ")" << endl);
00572 
00573     if (d_http_cache) {
00574         d_http_cache->set_cache_enabled(d_rcr->get_use_cache());
00575         d_http_cache->set_expire_ignored(d_rcr->get_ignore_expires() != 0);
00576         d_http_cache->set_max_size(d_rcr->get_max_cache_size());
00577         d_http_cache->set_max_entry_size(d_rcr->get_max_cached_obj());
00578         d_http_cache->set_default_expiration(d_rcr->get_default_expires());
00579         d_http_cache->set_always_validate(d_rcr->get_always_validate() != 0);
00580     }
00581 
00582     d_cookie_jar = rcr->get_cookie_jar();
00583 
00584     www_lib_init();  // This may throw either Error or InternalErr
00585 }
00586 
00587 HTTPConnect::~HTTPConnect()
00588 {
00589     DBG2(cerr << "Entering the HTTPConnect dtor" << endl);
00590 
00591     curl_easy_cleanup(d_curl);
00592 
00593     DBG2(cerr << "Leaving the HTTPConnect dtor" << endl);
00594 }
00595 
/** Predicate that is true for strings that start with a given header name.
    The comparison is case sensitive.

    The header name is stored by value, so the predicate stays valid after
    the string it was built from is gone (for example when it is built from
    a temporary). Does not derive from std::unary_function, which was
    deprecated in C++11 and removed in C++17. */
class HeaderMatch {
    std::string d_header;
    public:
        /** @param header The prefix to look for, e.g. "Content-Type:". */
        HeaderMatch(const std::string &header) : d_header(header) {}

        /** @param arg A header line.
            @return True if arg begins with the stored prefix. */
        bool operator()(const std::string &arg) const
        {
            // Compare only the leading characters; no need to scan all of arg.
            return arg.compare(0, d_header.size(), d_header) == 0;
        }
};
00603 
00616 HTTPResponse *
00617 HTTPConnect::fetch_url(const string &url)
00618 {
00619 #ifdef HTTP_TRACE
00620     cout << "GET " << url << " HTTP/1.0" << endl;
00621 #endif
00622 
00623     HTTPResponse *stream;
00624 
00625     if (/*d_http_cache && d_http_cache->*/is_cache_enabled()) {
00626         stream = caching_fetch_url(url);
00627     }
00628     else {
00629         stream = plain_fetch_url(url);
00630     }
00631 
00632 #ifdef HTTP_TRACE
00633         stringstream ss;
00634         ss << "HTTP/1.0 " << stream->get_status() << " -" << endl;
00635         for (size_t i = 0; i < stream->get_headers()->size(); i++) {
00636                 ss << stream->get_headers()->at(i) << endl;
00637         }
00638         cout << ss.str();
00639 #endif
00640 
00641     ParseHeader parser;
00642 
00643     // An apparent quirk of libcurl is that it does not pass the Content-type
00644     // header to the callback used to save them, but check and add it from the
00645     // saved state variable only if it's not there (without this a test failed
00646     // in HTTPCacheTest). jhrg 11/12/13
00647     if (!d_content_type.empty() && find_if(stream->get_headers()->begin(), stream->get_headers()->end(),
00648                                                                            HeaderMatch("Content-Type:")) == stream->get_headers()->end())
00649         stream->get_headers()->push_back("Content-Type: " + d_content_type);
00650 
00651     parser = for_each(stream->get_headers()->begin(), stream->get_headers()->end(), ParseHeader());
00652 
00653 #ifdef HTTP_TRACE
00654     cout << endl << endl;
00655 #endif
00656 
00657     // handle redirection case (2007-04-27, gaffigan@sfos.uaf.edu)
00658     if (parser.get_location() != "" &&
00659             url.substr(0,url.find("?",0)).compare(parser.get_location().substr(0,url.find("?",0))) != 0) {
00660         delete stream;
00661         return fetch_url(parser.get_location());
00662     }
00663 
00664     stream->set_type(parser.get_object_type()); // uses the value of content-description
00665 
00666     stream->set_version(parser.get_server());
00667     stream->set_protocol(parser.get_protocol());
00668 
00669     if (d_use_cpp_streams) {
00670         stream->transform_to_cpp();
00671     }
00672 
00673     return stream;
00674 }
00675 
00676 // Look around for a reasonable place to put a temporary file. Check first
00677 // the value of the TMPDIR env var. If that does not yield a path that's
00678 // writable (as defined by access(..., W_OK|R_OK)) then look at P_tmpdir (as
00679 // defined in stdio.h). If both come up empty, then use `./'.
00680 
00681 // Change this to a version that either returns a string or an open file
00682 // descriptor. Use information from https://buildsecurityin.us-cert.gov/
00683 // (see open()) to make it more secure. Ideal solution: get deserialize()
00684 // methods to read from a stream returned by libcurl, not from a temporary
00685 // file. 9/21/07 jhrg Updated to use strings, other misc changes. 3/22/11
00686 static string
00687 get_tempfile_template(const string &file_template)
00688 {
00689     string c;
00690 
00691     // Windows has one idea of the standard name(s) for a temporary files dir
00692 #ifdef WIN32
00693     // white list for a WIN32 directory
00694     Regex directory("[-a-zA-Z0-9_:\\]*");
00695 
00696     // If we're OK to use getenv(), try it.
00697 #ifdef USE_GETENV
00698     c = getenv("TEMP");
00699     if (c && directory.match(c.c_str(), c.length()) && (access(c.c_str(), 6) == 0))
00700         goto valid_temp_directory;
00701 
00702     c= getenv("TMP");
00703     if (c && directory.match(c.c_str(), c.length()) && (access(c.c_str(), 6) == 0))
00704         goto valid_temp_directory;
00705 #endif // USE_GETENV
00706 
00707     // The windows default
00708     c = "c:\tmp";
00709     if (c && directory.match(c.c_str(), c.length()) && (access(c.c_str(), 6) == 0))
00710         goto valid_temp_directory;
00711 
00712 #else   // Unix/Linux/OSX has another...
00713     // white list for a directory
00714     Regex directory("[-a-zA-Z0-9_/]*");
00715 #ifdef USE_GETENV
00716     c = getenv("TMPDIR");
00717     if (directory.match(c.c_str(), c.length()) && (access(c.c_str(), W_OK | R_OK) == 0))
00718         goto valid_temp_directory;
00719 #endif // USE_GETENV
00720 
00721     // Unix defines this sometimes - if present, use it.
00722 #ifdef P_tmpdir
00723     if (access(P_tmpdir, W_OK | R_OK) == 0) {
00724         c = P_tmpdir;
00725         goto valid_temp_directory;
00726     }
00727 #endif
00728 
00729     // The Unix default
00730     c = "/tmp";
00731     if (directory.match(c.c_str(), c.length()) && (access(c.c_str(), W_OK | R_OK) == 0))
00732         goto valid_temp_directory;
00733 
00734 #endif  // WIN32
00735 
00736     // If we found nothing useful, use the current directory
00737     c = ".";
00738 
00739 valid_temp_directory:
00740 
00741 #ifdef WIN32
00742     c += "\\" + file_template;
00743 #else
00744     c += "/" + file_template;
00745 #endif
00746 
00747     return c;
00748 }
00749 
00768 string
00769 get_temp_file(FILE *&stream) throw(Error)
00770 {
00771     string dods_temp = get_tempfile_template((string)"dodsXXXXXX");
00772 
00773     vector<char> pathname(dods_temp.length() + 1);
00774 
00775     strncpy(&pathname[0], dods_temp.c_str(), dods_temp.length());
00776 
00777     DBG(cerr << "pathanme: " << &pathname[0] << " (" << dods_temp.length() + 1 << ")" << endl);
00778 
00779     // Open truncated for update. NB: mkstemp() returns a file descriptor.
00780 #if defined(WIN32) || defined(TEST_WIN32_TEMPS)
00781     stream = fopen(_mktemp(&pathname[0]), "w+b");
00782 #else
00783     // Make sure that temp files are accessible only by the owner.
00784     int mask = umask(077);
00785     if (mask < 0)
00786         throw Error("Could not set the file creation mask: " + string(strerror(errno)));
00787     int fd = mkstemp(&pathname[0]);
00788     if (fd < 0)
00789         throw Error("Could not create a temporary file to store the response: " + string(strerror(errno)));
00790 
00791     stream = fdopen(fd, "w+");
00792     umask(mask);
00793 #endif
00794 
00795     if (!stream)
00796         throw Error("Failed to open a temporary file for the data values (" + dods_temp + ")");
00797 
00798     dods_temp = &pathname[0];
00799     return dods_temp;
00800 }
00801 
00802 
00808 void
00809 close_temp(FILE *s, const string &name)
00810 {
00811     int res = fclose(s);
00812     if (res)
00813         throw InternalErr(__FILE__, __LINE__, "!FAIL! " + long_to_string(res));
00814 
00815     res = unlink(name.c_str());
00816     if (res != 0)
00817         throw InternalErr(__FILE__, __LINE__, "!FAIL! " + long_to_string(res));
00818 }
00819 
00841 HTTPResponse *
00842 HTTPConnect::caching_fetch_url(const string &url)
00843 {
00844     DBG(cerr << "Is this URL (" << url << ") in the cache?... ");
00845 
00846     vector<string> *headers = new vector<string>;
00847     string file_name;
00848     FILE *s = d_http_cache->get_cached_response(url, *headers, file_name);
00849     if (!s) {
00850         // url not in cache; get it and cache it
00851         DBGN(cerr << "no; getting response and caching." << endl);
00852         delete headers; headers = 0;
00853         time_t now = time(0);
00854         HTTPResponse *rs = plain_fetch_url(url);
00855         d_http_cache->cache_response(url, now, *(rs->get_headers()), rs->get_stream());
00856 
00857         return rs;
00858     }
00859     else { // url in cache
00860         DBGN(cerr << "yes... ");
00861 
00862         if (d_http_cache->is_url_valid(url)) { // url in cache and valid
00863             DBGN(cerr << "and it's valid; using cached response." << endl);
00864             HTTPCacheResponse *crs = new HTTPCacheResponse(s, 200, headers, file_name, d_http_cache);
00865             return crs;
00866         }
00867         else { // url in cache but not valid; validate
00868             DBGN(cerr << "but it's not valid; validating... ");
00869 
00870             d_http_cache->release_cached_response(s); // This closes 's'
00871             headers->clear();
00872             vector<string> cond_hdrs = d_http_cache->get_conditional_request_headers(url);
00873             FILE *body = 0;
00874             string dods_temp = get_temp_file(body);
00875             time_t now = time(0); // When was the request made (now).
00876             long http_status;
00877 
00878             try {
00879                 http_status = read_url(url, body, /*resp_hdrs*/headers, &cond_hdrs);
00880                 rewind(body);
00881             }
00882             catch (Error &e) {
00883                 close_temp(body, dods_temp);
00884                 delete headers;
00885                 throw ;
00886             }
00887 
00888             switch (http_status) {
00889                 case 200: { // New headers and new body
00890                     DBGN(cerr << "read a new response; caching." << endl);
00891 
00892                     d_http_cache->cache_response(url, now, /* *resp_hdrs*/*headers, body);
00893                     HTTPResponse *rs = new HTTPResponse(body, http_status, /*resp_hdrs*/headers, dods_temp);
00894 
00895                     return rs;
00896                 }
00897 
00898                 case 304: { // Just new headers, use cached body
00899                     DBGN(cerr << "cached response valid; updating." << endl);
00900 
00901                     close_temp(body, dods_temp);
00902                     d_http_cache->update_response(url, now, /* *resp_hdrs*/ *headers);
00903                     string file_name;
00904                     FILE *hs = d_http_cache->get_cached_response(url, *headers, file_name);
00905                     HTTPCacheResponse *crs = new HTTPCacheResponse(hs, 304, headers, file_name, d_http_cache);
00906                     return crs;
00907                 }
00908 
00909                 default: { // Oops.
00910                     close_temp(body, dods_temp);
00911                     if (http_status >= 400) {
00912                         delete headers; headers = 0;
00913                         string msg = "Error while reading the URL: ";
00914                         msg += url;
00915                         msg
00916                         += ".\nThe OPeNDAP server returned the following message:\n";
00917                         msg += http_status_to_string(http_status);
00918                         throw Error(msg);
00919                     }
00920                     else {
00921                         delete headers; headers = 0;
00922                         throw InternalErr(__FILE__, __LINE__,
00923                                 "Bad response from the HTTP server: " + long_to_string(http_status));
00924                     }
00925                 }
00926             }
00927         }
00928     }
00929 
00930     throw InternalErr(__FILE__, __LINE__, "Should never get here");
00931 }
00932 
00944 HTTPResponse *
00945 HTTPConnect::plain_fetch_url(const string &url)
00946 {
00947         DBG(cerr << "Getting URL: " << url << endl);
00948         FILE *stream = 0;
00949         string dods_temp = get_temp_file(stream);
00950         vector<string> *resp_hdrs = new vector<string>;
00951 
00952         int status = -1;
00953         try {
00954                 status = read_url(url, stream, resp_hdrs); // Throws Error.
00955                 if (status >= 400) {
00956                         // delete resp_hdrs; resp_hdrs = 0;
00957                         string msg = "Error while reading the URL: ";
00958                         msg += url;
00959                         msg += ".\nThe OPeNDAP server returned the following message:\n";
00960                         msg += http_status_to_string(status);
00961                         throw Error(msg);
00962                 }
00963         }
00964 
00965         catch (Error &e) {
00966                 delete resp_hdrs;
00967                 close_temp(stream, dods_temp);
00968                 throw;
00969         }
00970 
00971 #if 0
00972         if (d_use_cpp_streams) {
00973                 fclose(stream);
00974                 fstream *in = new fstream(dods_temp.c_str(), ios::in|ios::binary);
00975                 return new HTTPResponse(in, status, resp_hdrs, dods_temp);
00976         }
00977         else {
00978 #endif
00979         rewind(stream);
00980         return new HTTPResponse(stream, status, resp_hdrs, dods_temp);
00981 #if 0
00982 }
00983 #endif
00984 }
00985 
00997 void
00998 HTTPConnect::set_accept_deflate(bool deflate)
00999 {
01000     d_accept_deflate = deflate;
01001 
01002     if (d_accept_deflate) {
01003         if (find(d_request_headers.begin(), d_request_headers.end(),
01004                  "Accept-Encoding: deflate, gzip, compress") == d_request_headers.end())
01005             d_request_headers.push_back(string("Accept-Encoding: deflate, gzip, compress"));
01006         DBG(copy(d_request_headers.begin(), d_request_headers.end(),
01007                  ostream_iterator<string>(cerr, "\n")));
01008     }
01009     else {
01010         vector<string>::iterator i;
01011         i = remove_if(d_request_headers.begin(), d_request_headers.end(),
01012                       bind2nd(equal_to<string>(),
01013                               string("Accept-Encoding: deflate, gzip, compress")));
01014         d_request_headers.erase(i, d_request_headers.end());
01015     }
01016 }
01017 
01026 void
01027 HTTPConnect::set_xdap_protocol(int major, int minor)
01028 {
01029     // Look for, and remove if one exists, an XDAP-Accept header
01030     vector<string>::iterator i;
01031     i = find_if(d_request_headers.begin(), d_request_headers.end(),
01032                 HeaderMatch("XDAP-Accept:"));
01033     if (i != d_request_headers.end())
01034         d_request_headers.erase(i);
01035 
01036     // Record and add the new header value
01037     d_dap_client_protocol_major = major;
01038     d_dap_client_protocol_minor = minor;
01039     ostringstream xdap_accept;
01040     xdap_accept << "XDAP-Accept: " << major << "." << minor;
01041 
01042     d_request_headers.push_back(xdap_accept.str());
01043 
01044     DBG(copy(d_request_headers.begin(), d_request_headers.end(),
01045              ostream_iterator<string>(cerr, "\n")));
01046 }
01047 
01063 void
01064 HTTPConnect::set_credentials(const string &u, const string &p)
01065 {
01066     if (u.empty())
01067         return;
01068 
01069     // Store the credentials locally.
01070     d_username = u;
01071     d_password = p;
01072 
01073     d_upstring = u + ":" + p;
01074 }
01075 
01076 } // namespace libdap