pion  5.0.6
include/pion/http/parser.hpp
00001 // ---------------------------------------------------------------------
00002 // pion:  a Boost C++ framework for building lightweight HTTP interfaces
00003 // ---------------------------------------------------------------------
00004 // Copyright (C) 2007-2014 Splunk Inc.  (https://github.com/splunk/pion)
00005 //
00006 // Distributed under the Boost Software License, Version 1.0.
00007 // See http://www.boost.org/LICENSE_1_0.txt
00008 //
00009 
00010 #ifndef __PION_HTTP_PARSER_HEADER__
00011 #define __PION_HTTP_PARSER_HEADER__
00012 
00013 #include <string>
00014 #include <boost/noncopyable.hpp>
00015 #include <boost/function/function2.hpp>
00016 #include <boost/logic/tribool.hpp>
00017 #include <boost/system/error_code.hpp>
00018 #include <boost/thread/once.hpp>
00019 #include <pion/config.hpp>
00020 #include <pion/logger.hpp>
00021 #include <pion/http/message.hpp>
00022 
// Guarantee that BOOST_SYSTEM_NOEXCEPT exists: when none of the headers included
// above has defined it, fall back to BOOST_NOEXCEPT. The macro decorates
// error_category_t::name() further down in this header.
00023 #ifndef BOOST_SYSTEM_NOEXCEPT
00024     #define BOOST_SYSTEM_NOEXCEPT BOOST_NOEXCEPT
00025 #endif
00026 
00027 
00028 namespace pion {    // begin namespace pion
00029 namespace http {    // begin namespace http
00030 
00031 
00032 // forward declarations used for finishing HTTP messages
00033 class request;
00034 class response;
00035 
00039 class PION_API parser :
00040     private boost::noncopyable
00041 {
00042 
00043 public:
00044 
/// Default for the parser's maximum content length: used as the constructor's
/// default argument and restored by reset_max_content_length(). The value itself
/// is defined outside this header.
00046     static const std::size_t        DEFAULT_CONTENT_MAX;
00047 
/// Callback type: void(const char *, std::size_t). Installed with set_payload_handler().
/// NOTE(review): presumably invoked with a pointer to payload bytes and their
/// count — confirm against the implementation file.
00049     typedef boost::function2<void, const char *, std::size_t>   payload_handler_t;
00050     
/// Error codes produced by the parser; error_category_t::message() supplies the
/// human-readable text for each one. Enumerator order defines the numeric values.
/// NOTE(review): numbering starts at 1, presumably so 0 keeps meaning "no error" — confirm.
00052     enum error_value_t {
00053         ERROR_METHOD_CHAR = 1,   ///< invalid method character
00054         ERROR_METHOD_SIZE,   ///< method exceeds maximum size
00055         ERROR_URI_CHAR,   ///< invalid URI character
00056         ERROR_URI_SIZE,   ///< URI exceeds maximum size
00057         ERROR_QUERY_CHAR,   ///< invalid query string character
00058         ERROR_QUERY_SIZE,   ///< query string exceeds maximum size
00059         ERROR_VERSION_EMPTY,   ///< HTTP version undefined
00060         ERROR_VERSION_CHAR,   ///< invalid version character
00061         ERROR_STATUS_EMPTY,   ///< HTTP status undefined
00062         ERROR_STATUS_CHAR,   ///< invalid status character
00063         ERROR_HEADER_CHAR,   ///< invalid header character
00064         ERROR_HEADER_NAME_SIZE,   ///< header name exceeds maximum size
00065         ERROR_HEADER_VALUE_SIZE,   ///< header value exceeds maximum size
00066         ERROR_INVALID_CONTENT_LENGTH,   ///< invalid Content-Length header
00067         ERROR_CHUNK_CHAR,   ///< invalid chunk character
00068         ERROR_MISSING_CHUNK_DATA,   ///< missing chunk data
00069         ERROR_MISSING_HEADER_DATA,   ///< missing header data
00070         ERROR_MISSING_TOO_MUCH_CONTENT,   ///< missing too much content
00071     };
00072     
00074     class error_category_t
00075         : public boost::system::error_category
00076     {
00077     public:
00078         const char *name() const BOOST_SYSTEM_NOEXCEPT { return "parser"; }
00079         std::string message(int ev) const {
00080             switch (ev) {
00081             case ERROR_METHOD_CHAR:
00082                 return "invalid method character";
00083             case ERROR_METHOD_SIZE:
00084                 return "method exceeds maximum size";
00085             case ERROR_URI_CHAR:
00086                 return "invalid URI character";
00087             case ERROR_URI_SIZE:
00088                 return "method exceeds maximum size";
00089             case ERROR_QUERY_CHAR:
00090                 return "invalid query string character";
00091             case ERROR_QUERY_SIZE:
00092                 return "query string exceeds maximum size";
00093             case ERROR_VERSION_EMPTY:
00094                 return "HTTP version undefined";
00095             case ERROR_VERSION_CHAR:
00096                 return "invalid version character";
00097             case ERROR_STATUS_EMPTY:
00098                 return "HTTP status undefined";
00099             case ERROR_STATUS_CHAR:
00100                 return "invalid status character";
00101             case ERROR_HEADER_CHAR:
00102                 return "invalid header character";
00103             case ERROR_HEADER_NAME_SIZE:
00104                 return "header name exceeds maximum size";
00105             case ERROR_HEADER_VALUE_SIZE:
00106                 return "header value exceeds maximum size";
00107             case ERROR_INVALID_CONTENT_LENGTH:
00108                 return "invalid Content-Length header";
00109             case ERROR_CHUNK_CHAR:
00110                 return "invalid chunk character";
00111             case ERROR_MISSING_HEADER_DATA:
00112                 return "missing header data";
00113             case ERROR_MISSING_CHUNK_DATA:
00114                 return "missing chunk data";
00115             case ERROR_MISSING_TOO_MUCH_CONTENT:
00116                 return "missing too much content";
00117             }
00118             return "parser error";
00119         }
00120     };
00121 
/// Builds a parser in its start state with no read buffer attached.
/// @param is_request          true: header parsing begins at PARSE_METHOD_START;
///                            false: it begins at PARSE_HTTP_VERSION_H
/// @param max_content_length  stored as the maximum content length
///                            (defaults to DEFAULT_CONTENT_MAX)
/// Every counter, including the two per-chunk counters, starts at zero so that no
/// member is left with an indeterminate value. Initialisers follow declaration order.
00129     parser(const bool is_request, std::size_t max_content_length = DEFAULT_CONTENT_MAX)
00130         : m_logger(PION_GET_LOGGER("pion.http.parser")), m_is_request(is_request),
00131         m_read_ptr(NULL), m_read_end_ptr(NULL), m_message_parse_state(PARSE_START),
00132         m_headers_parse_state(is_request ? PARSE_METHOD_START : PARSE_HTTP_VERSION_H),
00133         m_chunked_content_parse_state(PARSE_CHUNK_SIZE_START), m_status_code(0),
00134         m_size_of_current_chunk(0), m_bytes_read_in_current_chunk(0),
00135         m_bytes_content_remaining(0), m_bytes_content_read(0),
00136         m_bytes_last_read(0), m_bytes_total_read(0),
00137         m_max_content_length(max_content_length),
00138         m_parse_headers_only(false), m_save_raw_headers(false) {}
00139 
/// Virtual destructor with an empty body. The class also declares the virtual hook
/// finished_parsing_headers(), so it is meant to be usable as a base class.
00141     virtual ~parser() {}
00142 
/// Declaration only; the definition is outside this header.
/// NOTE(review): presumably parses the bytes registered through set_read_buffer()
/// into http_msg and reports failures through ec; the meaning of the three
/// tribool outcomes is not visible here — confirm against the implementation.
00154     boost::tribool parse(http::message& http_msg, boost::system::error_code& ec);
00155 
/// Declaration only; the definition is outside this header.
/// NOTE(review): presumably tells the parser that len bytes of the message were
/// lost or skipped — confirm how len, ec and the tribool result are used.
00168     boost::tribool parse_missing_data(http::message& http_msg, std::size_t len,
00169         boost::system::error_code& ec);
00170 
/// Declaration only; const member taking the message by non-const reference.
/// check_premature_eof() calls it after concatenating the message's chunks.
/// NOTE(review): presumably finalises http_msg once parsing is complete — confirm.
00176     void finish(http::message& http_msg) const;
00177 
/// Attaches an input span to the parser: the read position becomes ptr and the end
/// marker becomes ptr + len. Only the two pointers are stored here; nothing is copied.
/// @param ptr  first byte of the span
/// @param len  number of bytes in the span
00184     void set_read_buffer(const char *ptr, size_t len) {
00185         m_read_end_ptr = ptr + len;
00186         m_read_ptr = ptr;
00187     }
00188 
/// Copies the parser's current read position and end marker into the caller's
/// variables. The parser itself is not modified.
/// @param read_ptr      receives m_read_ptr
/// @param read_end_ptr  receives m_read_end_ptr (written second)
00195     void load_read_pos(const char *&read_ptr, const char *&read_end_ptr) const
00196     {
00197         read_ptr = m_read_ptr; read_end_ptr = m_read_end_ptr;
00198     }
00199 
/// Decides whether running out of input at this point is premature.
/// When the message state is PARSE_CONTENT_NO_LENGTH the message is closed out
/// instead: the state becomes PARSE_END, the message's chunks are concatenated,
/// finish() is called, and false is returned. In every other state nothing is
/// changed and true is returned.
/// @param http_msg  message being parsed
/// @return true if the end of input is premature, false if the message was finished
00207     bool check_premature_eof(http::message& http_msg) {
00208         if (m_message_parse_state == PARSE_CONTENT_NO_LENGTH) {
00209             m_message_parse_state = PARSE_END;
00210             http_msg.concatenate_chunks();
00211             finish(http_msg);
00212             return false;
00213         }
00214         return true;
00215     }
00216 
/// Sets the headers-only flag (m_parse_headers_only).
/// @param b  new flag value; calling with no argument turns the flag on
00222     void parse_headers_only(bool b = true) { m_parse_headers_only = b; }
00223 
/// Calls finish_header_parsing() on http_msg with a throw-away error_code.
/// Both the tribool result and any error written to the error_code are discarded.
/// @param http_msg  message handed to finish_header_parsing()
00229     void skip_header_parsing(http::message& http_msg) {
00230         boost::system::error_code ignored_ec;
00231         static_cast<void>(finish_header_parsing(http_msg, ignored_ec));
00232     }
00233     
/// Returns the parser to its start state.
/// Rewinds the three state machines, zeroes the status code and the three
/// "bytes read" counters, and empties the status message, method, resource,
/// query string and raw-header strings.
/// Nothing else is touched: the read buffer pointers, the maximum content length,
/// the headers-only and save-raw-headers flags and the payload handler all keep
/// their current values.
00235     void reset() {
00236         m_bytes_content_read = m_bytes_last_read = m_bytes_total_read = 0;
00237         m_raw_headers.clear();
00238         m_query_string.clear();
00239         m_resource.clear();
00240         m_method.clear();
00241         m_status_message.clear();
00242         m_status_code = 0;
00243         m_chunked_content_parse_state = PARSE_CHUNK_SIZE_START;
00244         m_headers_parse_state = m_is_request ? PARSE_METHOD_START : PARSE_HTTP_VERSION_H;
00245         m_message_parse_state = PARSE_START;
00246     }
00247 
/// @return true when no read buffer is attached (m_read_ptr is NULL) or the read
///         position has reached or passed the end marker
00249     bool eof() const { return !(m_read_ptr != NULL && m_read_ptr < m_read_end_ptr); }
00250 
/// @return distance from the read position to the end marker, or 0 when eof() is true
00252     std::size_t bytes_available() const { return eof() ? 0 : static_cast<std::size_t>(m_read_end_ptr - m_read_ptr); }
00253 
/// @return the value of m_bytes_last_read (zero after construction or reset())
00255     std::size_t gcount() const { return m_bytes_last_read; }
00256 
/// @return the value of m_bytes_total_read (zero after construction or reset())
00258     std::size_t get_total_bytes_read() const { return m_bytes_total_read; }
00259 
/// @return the value of m_bytes_content_read (zero after construction or reset())
00261     std::size_t get_content_bytes_read() const { return m_bytes_content_read; }
00262 
/// @return the configured maximum content length
00264     std::size_t get_max_content_length() const { return m_max_content_length; }
00265 
/// @return reference to the raw-headers string owned by the parser; reset() empties it
00267     const std::string& get_raw_headers() const { return m_raw_headers; }
00268 
/// @return current value of the save-raw-headers flag
00270     bool get_save_raw_headers() const { return m_save_raw_headers; }
00271 
/// @return current value of the headers-only flag (see parse_headers_only()).
/// Declared const, like the other read-only accessors, so it can be called on a
/// const parser; it only reads m_parse_headers_only.
00273     inline bool get_parse_headers_only(void) const { return m_parse_headers_only; }
00274     
/// @return true when the parser was constructed with is_request == true
00276     bool is_parsing_request() const { return m_is_request; }
00277 
/// @return true when the parser was constructed with is_request == false
00279     bool is_parsing_response() const { return m_is_request == false; }
00280 
/// Stores a copy of h as the parser's payload handler.
/// @param h  handler to copy. Taken by reference-to-const because it is only
///           copied, so const objects and temporaries are accepted as well as
///           the non-const lvalues callers could already pass.
00282     inline void set_payload_handler(const payload_handler_t& h) { m_payload_handler = h; }
00283 
/// Replaces the maximum content length.
/// @param n  new limit
00285     void set_max_content_length(std::size_t n) { m_max_content_length = n; }
00286 
/// Puts the maximum content length back to DEFAULT_CONTENT_MAX.
00288     void reset_max_content_length() { m_max_content_length = DEFAULT_CONTENT_MAX; }
00289 
/// Sets the save-raw-headers flag.
/// @param b  new flag value
00291     void set_save_raw_headers(bool b) { m_save_raw_headers = b; }
00292 
/// Replaces the logger held by the parser.
/// @param log_ptr  logger to store (passed and stored by value)
00294     void set_logger(logger log_ptr) { m_logger = log_ptr; }
00295 
/// @return a copy of the logger held by the parser.
/// Declared const so it can be called on a const parser; it only copies m_logger
/// and does not modify the object.
00297     inline logger get_logger(void) const { return m_logger; }
00298 
00299 
/// Declaration only; defined outside this header.
/// NOTE(review): presumably splits uri into the proto, host, port, path and query
/// out-parameters and returns true on success — confirm against the implementation.
00312     static bool parse_uri(const std::string& uri, std::string& proto, 
00313                          std::string& host, boost::uint16_t& port, std::string& path,
00314                          std::string& query);
00315 
/// Declaration only; defined outside this header. The std::string overload below
/// forwards to this one with c_str() and size().
/// NOTE(review): presumably decodes the len bytes at ptr as url-encoded name=value
/// pairs into dict — confirm, including what a false result means.
00326     static bool parse_url_encoded(ihash_multimap& dict,
00327                                 const char *ptr, const std::size_t len);
00328 
/// Declaration only; defined outside this header. The std::string overload below
/// forwards to this one with c_str() and size().
/// NOTE(review): presumably parses a multipart/form-data body of len bytes at ptr,
/// using content_type to locate the boundary — confirm.
00340     static bool parse_multipart_form_data(ihash_multimap& dict,
00341                                           const std::string& content_type,
00342                                           const char *ptr, const std::size_t len);
00343     
/// Declaration only; defined outside this header. The std::string overload below
/// forwards to this one with c_str() and size().
/// NOTE(review): presumably parses the len bytes at ptr as a Cookie header, or as a
/// Set-Cookie header when set_cookie_header is true, into dict — confirm.
00355     static bool parse_cookie_header(ihash_multimap& dict,
00356                                   const char *ptr, const std::size_t len,
00357                                   bool set_cookie_header);
00358 
/// std::string convenience overload: forwards the header's character data and
/// length to the (ptr, len) overload and returns that overload's result.
/// @param dict               container passed through to the (ptr, len) overload
/// @param cookie_header      header text to parse
/// @param set_cookie_header  passed through unchanged
00369     static bool parse_cookie_header(ihash_multimap& dict, const std::string& cookie_header,
00370                                     bool set_cookie_header) {
00371         const char * const text = cookie_header.c_str();
00372         return parse_cookie_header(dict, text, cookie_header.size(), set_cookie_header);
00373     }
00374 
/// std::string convenience overload: forwards the query's character data and
/// length to the (ptr, len) overload and returns that overload's result.
/// @param dict   container passed through to the (ptr, len) overload
/// @param query  text to parse
00384     static bool parse_url_encoded(ihash_multimap& dict, const std::string& query) {
00385         const char * const text = query.c_str();
00386         const std::size_t length = query.size();
00387         return parse_url_encoded(dict, text, length);
00388     }
00389     
/// std::string convenience overload: forwards the form data's character data and
/// length to the (ptr, len) overload and returns that overload's result.
/// @param dict          container passed through to the (ptr, len) overload
/// @param content_type  passed through unchanged
/// @param form_data     body text to parse
00400     static bool parse_multipart_form_data(ihash_multimap& dict,
00401                                           const std::string& content_type,
00402                                           const std::string& form_data) {
00403         const char * const body = form_data.c_str();
00404         return parse_multipart_form_data(dict, content_type, body, form_data.size());
00405     }
00406     
/// Declaration only; defined outside this header. skip_header_parsing() calls it
/// and ignores both the result and ec.
/// NOTE(review): presumably completes header processing for http_msg and decides
/// how any content that follows is to be read — confirm, including the meaning of
/// the tribool result.
00419     boost::tribool finish_header_parsing(http::message& http_msg,
00420                                          boost::system::error_code& ec);
00421 
/// Declaration only; defined outside this header.
/// NOTE(review): presumably extracts a public IP address from a forwarded-for style
/// header value into public_ip and returns true when one was found — confirm.
00431     static bool parse_forwarded_for(const std::string& header, std::string& public_ip);
00432     
/// Returns the shared error category object for parser errors.
/// create_error_category() is run through boost::call_once, keyed on
/// m_instance_flag, before m_error_category_ptr is dereferenced.
/// @return reference to the object m_error_category_ptr points to
00434     static error_category_t& get_error_category() {
00435         boost::call_once(&parser::create_error_category, m_instance_flag);
00436         error_category_t& category = *m_error_category_ptr;
00437         return category; }
00438 
00439 
00440 protected:
00441 
/// Virtual hook with an empty default body; derived classes may override it.
/// NOTE(review): presumably invoked once header parsing has ended, with ec carrying
/// the outcome — the call site is not in this header; confirm.
00443     virtual void finished_parsing_headers(const boost::system::error_code& ec) {}
00444     
/// Declaration only; defined outside this header.
/// NOTE(review): presumably runs the header state machine (header_parse_state_t)
/// over the read buffer — confirm the meaning of the tribool result.
00457     boost::tribool parse_headers(http::message& http_msg, boost::system::error_code& ec);
00458 
/// Declaration only; const member.
/// NOTE(review): presumably copies the parsed method/resource/query/status fields
/// held by the parser into http_msg — confirm.
00464     void update_message_with_header_data(http::message& http_msg) const;
00465 
/// Declaration only.
/// NOTE(review): presumably runs the chunk state machine (chunk_parse_state_t) and
/// stores decoded chunk data in chunk_buffers — confirm.
00477     boost::tribool parse_chunks(http::message::chunk_cache_t& chunk_buffers,
00478         boost::system::error_code& ec);
00479 
/// Declaration only.
/// NOTE(review): presumably consumes body bytes for a message whose length is
/// known — confirm.
00491     boost::tribool consume_content(http::message& http_msg,
00492         boost::system::error_code& ec);
00493 
/// Declaration only.
/// NOTE(review): presumably appends the bytes currently available to chunk_buffers
/// and returns how many were consumed — confirm.
00501     std::size_t consume_content_as_next_chunk(http::message::chunk_cache_t& chunk_buffers);
00502 
/// Declaration only; static.
/// NOTE(review): presumably records on http_msg whether parsing succeeded, based on
/// msg_parsed_ok — confirm.
00508     static void compute_msg_status(http::message& http_msg, bool msg_parsed_ok);
00509 
/// Overwrites ec with a parser error.
/// @param ec  error_code to overwrite
/// @param ev  parser error value; stored as its int value together with the
///            category returned by get_error_category()
00516     static void set_error(boost::system::error_code& ec, error_value_t ev) {
00517         const boost::system::error_code parser_error(static_cast<int>(ev), get_error_category());
00518         ec = parser_error; }
00519 
/// Declaration only; get_error_category() passes this function to boost::call_once.
/// NOTE(review): presumably creates the error_category_t object and stores its
/// address in m_error_category_ptr — confirm.
00521     static void create_error_category(void);
00522 
00523 
00524     // misc functions used by the parsing functions
/// Character-class predicates; all but the last are defined inline at the end of
/// this header in terms of plain integer comparisons.
00525     inline static bool is_char(int c);   ///< c is within 0..127
00526     inline static bool is_control(int c);   ///< c is within 0..31, or 127
00527     inline static bool is_special(int c);   ///< c is one of a fixed set of punctuation characters, space or tab
00528     inline static bool is_digit(int c);   ///< c is '0'..'9'
00529     inline static bool is_hex_digit(int c);   ///< c is '0'..'9', 'a'..'f' or 'A'..'F'
00530     inline static bool is_cookie_attribute(const std::string& name, bool set_cookie_header);   ///< name is empty, starts with '$', or (Set-Cookie only) is a known attribute name
00531 
00532 
/// Size limits; declared here, with the values defined outside this header.
/// NOTE(review): presumably each is the maximum accepted length of the item named,
/// matching the "... exceeds maximum size" errors — confirm values and units.
00534     static const boost::uint32_t        STATUS_MESSAGE_MAX;
00535 
00537     static const boost::uint32_t        METHOD_MAX;
00538 
00540     static const boost::uint32_t        RESOURCE_MAX;
00541 
00543     static const boost::uint32_t        QUERY_STRING_MAX;
00544 
00546     static const boost::uint32_t        HEADER_NAME_MAX;
00547 
00549     static const boost::uint32_t        HEADER_VALUE_MAX;
00550 
00552     static const boost::uint32_t        QUERY_NAME_MAX;
00553 
00555     static const boost::uint32_t        QUERY_VALUE_MAX;
00556 
00558     static const boost::uint32_t        COOKIE_NAME_MAX;
00559 
00561     static const boost::uint32_t        COOKIE_VALUE_MAX;
00562 
00563 
/// Protected state shared with derived classes.
00565     mutable logger                      m_logger;   ///< logger; mutable, replaced by set_logger()
00566 
00568     const bool                          m_is_request;   ///< fixed at construction: true for requests, false for responses
00569 
00571     const char *                        m_read_ptr;   ///< current read position; NULL until set_read_buffer() is called
00572 
00574     const char *                        m_read_end_ptr;   ///< end marker of the read buffer (ptr + len from set_read_buffer())
00575 
00576 
00577 private:
00578 
/// Top-level message states. The parser starts (and restarts after reset()) in
/// PARSE_START. check_premature_eof() treats PARSE_CONTENT_NO_LENGTH specially and
/// moves it to PARSE_END.
00580     enum message_parse_state_t {
00581         PARSE_START, PARSE_HEADERS, PARSE_FOOTERS, PARSE_CONTENT,
00582         PARSE_CONTENT_NO_LENGTH, PARSE_CHUNKS, PARSE_END
00583     };
00584 
/// States used while parsing the header section. The constructor and reset() pick
/// the entry state: PARSE_METHOD_START for requests, PARSE_HTTP_VERSION_H for
/// responses.
00587     enum header_parse_state_t {
00588         PARSE_METHOD_START, PARSE_METHOD, PARSE_URI_STEM, PARSE_URI_QUERY,
00589         PARSE_HTTP_VERSION_H, PARSE_HTTP_VERSION_T_1, PARSE_HTTP_VERSION_T_2,
00590         PARSE_HTTP_VERSION_P, PARSE_HTTP_VERSION_SLASH,
00591         PARSE_HTTP_VERSION_MAJOR_START, PARSE_HTTP_VERSION_MAJOR,
00592         PARSE_HTTP_VERSION_MINOR_START, PARSE_HTTP_VERSION_MINOR,
00593         PARSE_STATUS_CODE_START, PARSE_STATUS_CODE, PARSE_STATUS_MESSAGE,
00594         PARSE_EXPECTING_NEWLINE, PARSE_EXPECTING_CR,
00595         PARSE_HEADER_WHITESPACE, PARSE_HEADER_START, PARSE_HEADER_NAME,
00596         PARSE_SPACE_BEFORE_HEADER_VALUE, PARSE_HEADER_VALUE,
00597         PARSE_EXPECTING_FINAL_NEWLINE, PARSE_EXPECTING_FINAL_CR
00598     };
00599 
/// States used while parsing chunked content. The constructor and reset() start
/// this state machine at PARSE_CHUNK_SIZE_START.
00602     enum chunk_parse_state_t {
00603         PARSE_CHUNK_SIZE_START, PARSE_CHUNK_SIZE,
00604         PARSE_EXPECTING_IGNORED_TEXT_AFTER_CHUNK_SIZE,
00605         PARSE_EXPECTING_CR_AFTER_CHUNK_SIZE,
00606         PARSE_EXPECTING_LF_AFTER_CHUNK_SIZE, PARSE_CHUNK, 
00607         PARSE_EXPECTING_CR_AFTER_CHUNK, PARSE_EXPECTING_LF_AFTER_CHUNK,
00608         PARSE_EXPECTING_FINAL_CR_OR_FOOTERS_AFTER_LAST_CHUNK, 
00609         PARSE_EXPECTING_FINAL_LF_AFTER_LAST_CHUNK
00610     };
00611 
00612 
/// Private parser state.
00614     message_parse_state_t               m_message_parse_state;   ///< top-level state; PARSE_START after construction and reset()
00615 
00617     header_parse_state_t                m_headers_parse_state;   ///< header state; entry value depends on m_is_request
00618 
00620     chunk_parse_state_t                 m_chunked_content_parse_state;   ///< chunk state; PARSE_CHUNK_SIZE_START after construction and reset()
00621     
00623     payload_handler_t                   m_payload_handler;   ///< callback installed by set_payload_handler()
00624 
00626     boost::uint16_t                     m_status_code;   ///< zeroed by the constructor and reset()
00627 
00629     std::string                         m_status_message;   ///< emptied by reset()
00630 
00632     std::string                         m_method;   ///< emptied by reset()
00633 
00635     std::string                         m_resource;   ///< emptied by reset()
00636 
00638     std::string                         m_query_string;   ///< emptied by reset()
00639 
00641     std::string                         m_raw_headers;   ///< exposed by get_raw_headers(); emptied by reset()
00642 
00644     std::string                         m_header_name;   ///< not touched by reset(); presumably the header name being parsed — confirm
00645 
00647     std::string                         m_header_value;   ///< not touched by reset(); presumably the header value being parsed — confirm
00648 
00650     std::string                         m_chunk_size_str;   ///< not touched by reset(); presumably the chunk-size text being parsed — confirm
00651 
00653     std::size_t                         m_size_of_current_chunk;   ///< presumably the size of the chunk being read — confirm
00654 
00656     std::size_t                         m_bytes_read_in_current_chunk;   ///< presumably progress within the current chunk — confirm
00657 
00659     std::size_t                         m_bytes_content_remaining;   ///< zeroed by the constructor; reset() leaves it unchanged
00660 
00662     std::size_t                         m_bytes_content_read;   ///< returned by get_content_bytes_read(); zeroed by reset()
00663 
00665     std::size_t                         m_bytes_last_read;   ///< returned by gcount(); zeroed by reset()
00666 
00668     std::size_t                         m_bytes_total_read;   ///< returned by get_total_bytes_read(); zeroed by reset()
00669 
00671     std::size_t                         m_max_content_length;   ///< see set_max_content_length() / reset_max_content_length()
00672     
00674     bool                                m_parse_headers_only;   ///< see parse_headers_only(); false after construction
00675 
00677     bool                                m_save_raw_headers;   ///< see set_save_raw_headers(); false after construction
00678 
00680     static error_category_t *           m_error_category_ptr;   ///< dereferenced by get_error_category() after the call_once
00681         
00683     static boost::once_flag             m_instance_flag;   ///< once-flag handed to boost::call_once in get_error_category()
00684 };
00685 
00686 
00687 // inline functions for parser
00688 
/// @return true when c lies in the range 0..127 (inclusive)
00689 inline bool parser::is_char(int c)
00690 {
00691     return !(c < 0 || c > 127);
00692 }
00693 
/// @return true when c is 127 or lies in the range 0..31 (inclusive)
00694 inline bool parser::is_control(int c)
00695 {
00696     return c == 127 || (c >= 0 && c <= 31);
00697 }
00698 
/// @return true when c is a space, a tab, or one of the punctuation characters
///         ( ) < > @ , ; : \ " / [ ] ? = { }
00699 inline bool parser::is_special(int c) {
00700     bool special = false;
00701     switch (c) {
00702     case ' ': case '\t': case '"': case '\\': case '/':
00703     case '(': case ')': case '<': case '>': case '[':
00704     case ']': case '{': case '}': case '@': case ',':
00705     case ';': case ':': case '?': case '=':
00706         special = true;
00707         break;
00708     }
00709     return special;
00710 }
00711 
/// @return true when c is one of the characters '0'..'9'
00712 inline bool parser::is_digit(int c)
00713 {
00714     return !(c < '0' || c > '9');
00715 }
00716 
/// @return true when c is a decimal digit or a letter 'a'..'f' / 'A'..'F'
00717 inline bool parser::is_hex_digit(int c)
00718 {
00719     return is_digit(c) || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f');
00720 }
00721 
/// Tells whether name should be treated as a cookie attribute rather than as the
/// name of a cookie.
/// @param name               name taken from a name=value pair
/// @param set_cookie_header  true when the pair comes from a Set-Cookie header
/// @return true when name is empty or starts with '$'; for Set-Cookie headers also
///         when name equals, ignoring case, one of the known attribute names below
00722 inline bool parser::is_cookie_attribute(const std::string& name, bool set_cookie_header)
00723 {
00724     if (name.empty() || name[0] == '$')
00725         return true;
00726     if (! set_cookie_header)
00727         return false;
// parse_cookie_header() is deliberately lenient about what counts as a cookie-pair
// in a Set-Cookie header. RFC 6265 treats everything after the first semicolon as
// an attribute, but the obsolete RFC 2109 allowed several comma-separated cookies,
// so any <name>=<value> pair is taken to be a cookie unless <name> is one of these
// known attribute names.
00728     const char * const known_attributes[] = {
00729         "Comment", "Domain", "Max-Age", "Path",
00730         "Secure", "Version", "Expires", "HttpOnly"
00731     };
00732     for (std::size_t i = 0; i < sizeof(known_attributes) / sizeof(known_attributes[0]); ++i) {
00733         if (boost::algorithm::iequals(name, known_attributes[i]))
00734             return true;
00735     }
00736     return false;
00737 }
00743 
00744 }   // end namespace http
00745 }   // end namespace pion
00746 
00747 #endif