Documentation for pulsar 0.9.2. For development docs, go here.

Source code for pulsar.utils.httpurl

'''This is a substantial module which imports several classes and functions
from the standard library in a python 2.6 to python 3.3 compatible fashion.
On top of that, it implements the :class:`HttpClient` for handling synchronous
and asynchronous HTTP requests in a pythonic way.

It is a thin layer on top of urllib2 in python2 / urllib in Python 3.
Several opensource efforts have been used as source of snippets:

* http-parser_
* request_
* urllib3_
* werkzeug_


.. _tools-http-headers:

HTTP Headers
~~~~~~~~~~~~~~~~~

.. autoclass:: Headers
   :members:
   :member-order: bysource


.. _tools-http-parser:

HTTP Parser
~~~~~~~~~~~~~~~~~

.. autoclass:: HttpParser
   :members:
   :member-order: bysource


.. _http-parser: https://github.com/benoitc/http-parser
.. _urllib3: https://github.com/shazow/urllib3
.. _request: https://github.com/kennethreitz/requests
.. _werkzeug: https://github.com/mitsuhiko/werkzeug
.. _`HTTP cookie`: http://en.wikipedia.org/wiki/HTTP_cookie
'''
import os
import sys
import re
import string
import time
import mimetypes
import platform
import socket
from hashlib import sha1, md5
from uuid import uuid4
from email.utils import formatdate
from io import BytesIO
import zlib
from collections import deque

from .structures import mapping_iterator, OrderedDict
from .pep import ispy3k, iteritems, itervalues, to_bytes, native_str
from .html import capfirst

# try:
#     from http_parser.parser import HttpParser as CHttpParser
#     hasextensions = True
#     _Http_Parser = CHttpParser
# except ImportError:  # pragma    nocover
#     hasextensions = False
#     _Http_Parser = None
#
# The http_parser has several bugs, therefore it is switched off
hasextensions = False
_Http_Parser = None

# Prefer ``select.poll``. When it cannot be imported ``poll`` is set to
# ``False`` and ``select.select`` is tried instead, itself set to ``False``
# when missing.
# NOTE(review): ``select`` is only bound in the fallback branch, so it is
# undefined in this module whenever ``poll`` imports successfully.
try:
    from select import poll, POLLIN
except ImportError:   # pragma    nocover
    poll = False
    try:
        from select import select
    except ImportError:  # pragma    nocover
        select = False

def setDefaultHttpParser(parser):   # pragma    nocover
    '''Install ``parser`` as the module-level parser factory.

    :param parser: callable stored in the ``_Http_Parser`` global; it is
        the object :func:`http_parser` calls to build a parser.
    '''
    global _Http_Parser
    _Http_Parser = parser


def http_parser(**kwargs):
    '''Create a parser by calling the currently installed parser factory.

    :param kwargs: keyword arguments forwarded untouched to the factory
        held in the ``_Http_Parser`` module global.
    :return: whatever the factory returns.
    '''
    factory = _Http_Parser
    return factory(**kwargs)


create_connection = socket.create_connection

# Optional SSL support. ``BaseSSLError`` and ``ssl`` are bound to ``None``
# before the import is attempted, so both names exist (as ``None``) when the
# interpreter has no usable ``ssl`` module.
try:    # Compiled with SSL?
    BaseSSLError = None
    ssl = None
    import ssl
    BaseSSLError = ssl.SSLError
except (ImportError, AttributeError):   # pragma : no cover
    # Deliberate best effort: keep the ``None`` placeholders set above.
    pass

if ispy3k:  # Python 3
    from urllib import request as urllibr
    from http import client as httpclient
    from urllib.parse import (quote, unquote, urlencode, urlparse, urlsplit,
                              parse_qs, parse_qsl, splitport, urlunparse,
                              urljoin)
    from http.client import responses
    from http.cookiejar import CookieJar, Cookie
    from http.cookies import SimpleCookie

    string_type = str
    getproxies_environment = urllibr.getproxies_environment
    ascii_letters = string.ascii_letters
    chr = chr
    is_string = lambda s: isinstance(s, str)

    def force_native_str(s, encoding=None):
        '''Return ``s`` as a native (text) string.

        :param s: ``bytes`` are decoded, ``str`` is returned unchanged and
            any other object is converted with :class:`str`.
        :param encoding: codec used to decode ``bytes``; ``utf-8`` when
            not given.
        '''
        if isinstance(s, str):
            return s
        if isinstance(s, bytes):
            return s.decode(encoding or 'utf-8')
        return str(s)

else:   # pragma : no cover
    import urllib2 as urllibr
    import httplib as httpclient
    from urllib import (quote, unquote, urlencode, getproxies_environment,
                        splitport)
    from urlparse import (urlparse, urlsplit, parse_qs, urlunparse, urljoin,
                          parse_qsl)
    from httplib import responses
    from cookielib import CookieJar, Cookie
    from Cookie import SimpleCookie

    string_type = unicode
    ascii_letters = string.letters
    chr = unichr
    is_string = lambda s: isinstance(s, unicode)

    if sys.version_info < (2, 7):
        #
        def create_connection(address, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,
                              source_address=None):
            """Connect to ``address`` and return the socket (from Python 2.7).

            Replacement for :func:`socket.create_connection`, defined only
            when running on an interpreter older than 2.7.

            :param address: a ``(host, port)`` pair.
            :param timeout: applied to the socket unless left at the global
                default sentinel.
            :param source_address: optional address the socket is bound to
                before connecting.
            :return: the first socket which connects successfully.
            :raise: the last error met if every candidate address fails, or
                a plain :class:`Exception` if ``getaddrinfo`` yields nothing.
            """
            host, port = address
            err = None
            # Try every address returned for the host, in order.
            for res in socket.getaddrinfo(host, port, 0, socket.SOCK_STREAM):
                af, socktype, proto, canonname, sa = res
                sock = None
                try:
                    sock = socket.socket(af, socktype, proto)
                    if timeout is not socket._GLOBAL_DEFAULT_TIMEOUT:
                        sock.settimeout(timeout)
                    if source_address:
                        sock.bind(source_address)
                    sock.connect(sa)
                    return sock
                except Exception as _:
                    # Remember the failure and release the half-open socket
                    # before moving on to the next candidate.
                    err = _
                    if sock is not None:
                        sock.close()
            if err is not None:
                raise err
            else:
                raise Exception("getaddrinfo returns an empty list")

    def force_native_str(s, encoding=None):
        '''Return ``s`` as a native (byte) string on Python 2.

        :param s: ``unicode`` is encoded, ``str`` is returned unchanged and
            any other object is converted with :class:`str`.
        :param encoding: codec used to encode ``unicode``; ``utf-8`` when
            not given.
        '''
        if isinstance(s, str):
            return s
        if isinstance(s, unicode):
            return s.encode(encoding or 'utf-8')
        return str(s)

HTTPError = urllibr.HTTPError
URLError = urllibr.URLError
request_host = urllibr.request_host
parse_http_list = urllibr.parse_http_list


class SSLError(HTTPError):
    '''Error raised when the SSL certificate fails in an HTTPS connection.'''

# ###################################################    URI & IRI STUFF
#
# The default charset is "iso-8859-1" (latin-1), from section 3.7.1 of
# http://www.ietf.org/rfc/rfc2616.txt
DEFAULT_CHARSET = 'ISO-8859-1'
# The reserved URI characters (RFC 3986 - section 2.2): the general
# delimiters plus the sub-delimiters.
URI_GEN_DELIMS = frozenset(':/?#[]@')
URI_SUB_DELIMS = frozenset("!$&'()*+,;=")
URI_RESERVED_SET = URI_GEN_DELIMS.union(URI_SUB_DELIMS)
# Same characters as a string (built from a set, so the order is arbitrary).
URI_RESERVED_CHARS = ''.join(URI_RESERVED_SET)
# The unreserved URI characters (RFC 3986 - section 2.3)
URI_UNRESERVED_SET = frozenset(ascii_letters + string.digits + '-._~')
# Characters left untouched when re-quoting a URI: reserved ones, '%' and '~'.
URI_SAFE_CHARS = URI_RESERVED_CHARS + '%~'
# Characters which may appear in a header token without quoting.
HEADER_TOKEN_CHARS = frozenset("!#$%&'*+-.0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                               '^_`abcdefghijklmnopqrstuvwxyz|~')

# ``escape`` percent-encodes everything except '~' (and the characters
# ``quote`` never encodes); ``urlquote`` additionally keeps the reserved set.
escape = lambda s: quote(s, safe='~')
urlquote = lambda iri: quote(iri, safe=URI_RESERVED_CHARS)


def _gen_unquote(uri):
    '''Yield the pieces of ``uri`` with unreserved percent-escapes decoded.

    The URI is converted to a native string (``latin1``) and split on
    ``%``. For every piece following a ``%``, the first two characters are
    read as a hexadecimal byte: if it maps to an unreserved character the
    decoded character is emitted, otherwise the escape is emitted unchanged.

    Malformed escapes (fewer than two characters, or two characters which
    are not hexadecimal digits such as ``%zz``) are emitted unchanged
    rather than raising :class:`ValueError`.

    :param uri: the URI, as bytes or string.
    :return: a generator of string fragments to be joined by the caller.
    '''
    unreserved_set = URI_UNRESERVED_SET
    for n, part in enumerate(force_native_str(uri, 'latin1').split('%')):
        if not n:
            # Text before the first '%' cannot contain an escape.
            yield part
            continue
        h = part[0:2]
        c = None
        if len(h) == 2:
            try:
                c = chr(int(h, 16))
            except ValueError:
                # Not a hexadecimal pair: leave the sequence as found.
                c = None
        if c is not None and c in unreserved_set:
            yield c + part[2:]
        else:
            yield '%' + part


def unquote_unreserved(uri):
    """Decode the percent-escapes of ``uri`` which stand for unreserved
    characters, leaving every other escape sequence as it is.

    :param uri: the URI to process.
    :return: a native string.
    """
    fragments = _gen_unquote(uri)
    return ''.join(fragments)


def requote_uri(uri):
    """Normalise the quoting of ``uri``.

    The URI goes through an unquote/quote round trip: first only the
    escapes of unreserved characters are decoded, then every character
    which is not in ``URI_SAFE_CHARS`` (reserved characters, ``%`` and
    ``~``) is percent-encoded by ``quote``.

    :param uri: the URI to re-quote.
    :return: the consistently quoted URI.
    """
    partially_unquoted = unquote_unreserved(uri)
    return quote(partially_unquoted, safe=URI_SAFE_CHARS)


def iri_to_uri(iri, kwargs=None):
    '''Convert an Internationalised Resource Identifier (IRI) portion
    to a URI portion suitable for inclusion in a URL.

    The original documentation describes this as the algorithm from
    section 3.1 of RFC 3987.

    :param iri: the IRI portion; ``None`` is returned unchanged.
    :param kwargs: optional mapping appended as a ``?key=value&...`` query
        string before quoting.
    :return: the quoted native string.
    '''
    if iri is None:
        return iri
    if kwargs:
        base = force_native_str(iri, 'latin1')
        query = '&'.join('%s=%s' % pair for pair in iteritems(kwargs))
        iri = '%s?%s' % (base, query)
    return urlquote(unquote_unreserved(iri))


def host_and_port(host):
    '''Split ``host`` into a ``(hostname, port)`` pair.

    :return: the port is an ``int`` when ``splitport`` finds one,
        otherwise ``None``.
    '''
    hostname, port = splitport(host)
    if port:
        return hostname, int(port)
    return hostname, None


def default_port(scheme):
    '''Return the default port of ``scheme`` as a string.

    :return: ``'80'`` for ``http``/``ws``, ``'443'`` for ``https``/``wss``
        and ``None`` for any other scheme.
    '''
    defaults = (('80', ("http", "ws")),
                ('443', ("https", "wss")))
    for port, schemes in defaults:
        if scheme in schemes:
            return port
    return None


def host_and_port_default(scheme, host):
    '''Split ``host`` into ``(hostname, port)``, using the default port of
    ``scheme`` when ``host`` carries none.

    :return: the port is whatever ``splitport`` or :func:`default_port`
        produced (a string, or ``None`` for a scheme without default).
    '''
    hostname, port = splitport(host)
    return hostname, port or default_port(scheme)


def host_no_default_port(scheme, netloc):
    '''Strip the port from ``netloc`` when it is the default for ``scheme``.

    :return: the bare host if the explicit port equals
        ``default_port(scheme)``, otherwise ``netloc`` unchanged.
    '''
    host, port = splitport(netloc)
    is_default = bool(port) and port == default_port(scheme)
    return host if is_default else netloc


def get_hostport(scheme, full_host):
    '''Return the ``(host, port)`` pair for ``full_host``.

    When :func:`host_and_port` finds no port, the text after the last
    ``:`` (if it follows any ``]``, as brackets wrap ipv6 addresses) is
    parsed as the port; an empty or absent port falls back to
    ``default_port(scheme)``. Surrounding ``[...]`` are removed from the
    host in that case.

    :raise httpclient.InvalidURL: if the port is not numeric.
    :raise TypeError: from ``int(None)`` when no port is given and
        ``scheme`` has no default port.
    '''
    host, port = host_and_port(full_host)
    if port is not None:
        return host, int(port)
    colon = host.rfind(':')
    bracket = host.rfind(']')         # ipv6 addresses have [...]
    if colon > bracket:
        tail = host[colon + 1:]
        host = host[:colon]
        try:
            port = int(tail)
        except ValueError:
            if tail:
                raise httpclient.InvalidURL("nonnumeric port: '%s'"
                                            % tail)
            # http://foo.com:/ == http://foo.com/
            port = default_port(scheme)
    else:
        port = default_port(scheme)
    if host and host[0] == '[' and host[-1] == ']':
        host = host[1:-1]
    return host, int(port)


def remove_double_slash(route):
    '''Collapse every run of slashes in ``route`` into a single ``/``.

    A route without ``//`` is returned as is.
    '''
    return re.sub('/+', '/', route) if '//' in route else route


# ###################################################    CONTENT TYPES
# Content types treated as JSON.
JSON_CONTENT_TYPES = ('application/json',
                      'application/javascript',
                      'text/json',
                      'text/x-json')
# ###################################################    REQUEST METHODS
# NOTE(review): judging by the names, methods in the first set carry their
# parameters in the URL and those in the second set in the body -- confirm
# against the client code which uses them.
ENCODE_URL_METHODS = frozenset(['DELETE', 'GET', 'HEAD', 'OPTIONS'])
ENCODE_BODY_METHODS = frozenset(['PATCH', 'POST', 'PUT', 'TRACE'])
# Status codes handled as redirects.
REDIRECT_CODES = (301, 302, 303, 305, 307)


def has_empty_content(status, method=None):
    '''Check if a response carries no content.

    This is the case for 204, 304 and 1xx status codes, and for any
    response to a ``HEAD`` request.

    :param status: the numeric status code.
    :param method: optional request method.
    :return: ``True`` or ``False``.
    '''
    bodyless_status = (status == httpclient.NO_CONTENT or
                       status == httpclient.NOT_MODIFIED or
                       100 <= status < 200)
    return bool(bodyless_status or method == "HEAD")


def is_succesful(status):
    '''Return ``True`` when ``status`` is a 2xx (successful) code.'''
    return 200 <= status < 300


# ###################################################    HTTP HEADERS
# NOTE(review): presumably the supported websocket protocol versions --
# confirm against the websocket code.
WEBSOCKET_VERSION = (8, 13)
# Known header field names, in canonical capitalisation, grouped under the
# keys 'general', 'request', 'response' and 'entity'.
HEADER_FIELDS = {'general': frozenset(('Cache-Control', 'Connection', 'Date',
                                       'Pragma', 'Trailer',
                                       'Transfer-Encoding',
                                       'Upgrade', 'Sec-WebSocket-Extensions',
                                       'Sec-WebSocket-Protocol',
                                       'Via', 'Warning')),
                 # The request-header fields allow the client to pass
                 # additional information about the request, and about the
                 # client to the server.
                 'request': frozenset(('Accept', 'Accept-Charset',
                                       'Accept-Encoding', 'Accept-Language',
                                       'Authorization',
                                       'Cookie', 'Expect', 'From',
                                       'Host', 'If-Match', 'If-Modified-Since',
                                       'If-None-Match', 'If-Range',
                                       'If-Unmodified-Since', 'Max-Forwards',
                                       'Proxy-Authorization', 'Range',
                                       'Referer',
                                       'Sec-WebSocket-Key',
                                       'Sec-WebSocket-Version',
                                       'TE',
                                       'User-Agent',
                                       'X-Requested-With')),
                 # The response-header fields allow the server to pass
                 # additional information about the response which cannot be
                 # placed in the Status-Line.
                 'response': frozenset(('Accept-Ranges',
                                        'Age',
                                        'ETag',
                                        'Location',
                                        'Proxy-Authenticate',
                                        'Retry-After',
                                        'Sec-WebSocket-Accept',
                                        'Server',
                                        'Set-Cookie',
                                        'Set-Cookie2',
                                        'Vary',
                                        'WWW-Authenticate',
                                        'X-Frame-Options')),
                 'entity': frozenset(('Allow', 'Content-Encoding',
                                      'Content-Language', 'Content-Length',
                                      'Content-Location', 'Content-MD5',
                                      'Content-Range', 'Content-Type',
                                      'Expires', 'Last-Modified'))}

# Fields a client may send: general + entity + request headers.
CLIENT_HEADER_FIELDS = HEADER_FIELDS['general'].union(
    HEADER_FIELDS['entity'], HEADER_FIELDS['request'])
# Fields a server may send: general + entity + response headers.
SERVER_HEADER_FIELDS = HEADER_FIELDS['general'].union(
    HEADER_FIELDS['entity'], HEADER_FIELDS['response'])
ALL_HEADER_FIELDS = CLIENT_HEADER_FIELDS.union(SERVER_HEADER_FIELDS)
# Lower-case field name -> canonical capitalisation.
ALL_HEADER_FIELDS_DICT = dict(((k.lower(), k) for k in ALL_HEADER_FIELDS))
CRLF = '\r\n'
# CRLF followed by a space.
LWS = '\r\n '
# Allowed field set for each kind of headers container.
TYPE_HEADER_FIELDS = {'client': CLIENT_HEADER_FIELDS,
                      'server': SERVER_HEADER_FIELDS,
                      'both': ALL_HEADER_FIELDS}

# Integer code <-> name of the headers kind.
header_type = {0: 'client', 1: 'server', 2: 'both'}
header_type_to_int = dict(((v, k) for k, v in header_type.items()))


def capheader(name):
    '''Apply ``capfirst`` to every dash-separated piece of ``name`` and
    join the non-empty results back with dashes.'''
    pieces = (capfirst(chunk) for chunk in name.split('-'))
    return '-'.join(piece for piece in pieces if piece)


def header_field(name, HEADERS_SET=None, strict=False):
    '''Return the header ``name`` in its canonical capitalisation.

    For example::

        header_field('connection') == 'Connection'
        header_field('accept-charset') == 'Accept-Charset'

    Names starting with ``x-`` are always capitalised with
    :func:`capheader`.

    :param HEADERS_SET: if given (and not empty), a known header is
        returned only when it belongs to this set, otherwise ``None``.
    :param strict: if ``True``, unknown headers give ``None`` rather than
        being capitalised with :func:`capheader`.
    '''
    name = name.lower()
    if name.startswith('x-'):
        return capheader(name)
    header = ALL_HEADER_FIELDS_DICT.get(name)
    if header:
        if HEADERS_SET:
            return header if header in HEADERS_SET else None
        return header
    if not strict:
        return capheader(name)
    return None


#    HEADERS UTILITIES
# Separator used to join multiple values of a header field; fields not
# listed here are joined with ', '. A ``None`` joiner makes the Headers
# container emit one header line per value when iterating or serialising.
HEADER_FIELDS_JOINER = {'Cookie': '; ',
                        'Set-Cookie': None,
                        'Set-Cookie2': None}


def split_comma(value):
    '''Split ``value`` on commas, returning the stripped, non-empty items
    as a list.'''
    items = []
    for chunk in value.split(','):
        chunk = chunk.strip()
        if chunk:
            items.append(chunk)
    return items


def parse_cookies(value):
    '''Parse a cookie header ``value`` with ``SimpleCookie`` and return the
    ``OutputString()`` of every cookie found, as a list.'''
    jar = SimpleCookie(value)
    return [morsel.OutputString() for morsel in jar.values()]


# Header fields whose raw value is split into several values, mapped to the
# function performing the split. Other fields keep their value whole.
header_parsers = {'Connection': split_comma,
                  'Cookie': parse_cookies}


def header_values(header, value):
    '''Return the list of values held by ``value`` for ``header``.

    The value is first converted with ``native_str``. Fields registered in
    ``header_parsers`` are split by their parser, any other field gives a
    one-element list.
    '''
    value = native_str(value)
    assert isinstance(value, str)
    parser = header_parsers.get(header)
    if parser is None:
        return [value]
    return parser(value)


def quote_header_value(value, extra_chars='', allow_token=True):
    """Quote a header value if necessary.

    :param value: the value to quote; it is converted to a native string.
    :param extra_chars: extra characters which do not force quoting.
    :param allow_token: when enabled, a value made only of token
        characters (plus ``extra_chars``) is returned unchanged.
    :return: the value itself, or the value wrapped in double quotes with
        backslashes and double quotes escaped.
    """
    value = force_native_str(value)
    if allow_token:
        allowed = HEADER_TOKEN_CHARS | set(extra_chars)
        if set(value).issubset(allowed):
            return value
    escaped = value.replace('\\', '\\\\').replace('"', '\\"')
    return '"%s"' % escaped


def unquote_header_value(value, is_filename=False):
    """Unquote a header value.

    Reversal of :func:`quote_header_value`. This is not the unquoting the
    RFC prescribes but the one matching what browsers actually send.

    :param value: the header value to unquote.
    :param is_filename: set when the value is a file name, so that UNC
        paths are preserved.
    :return: the value without surrounding double quotes; values which are
        not quoted (or empty) are returned unchanged.
    """
    is_quoted = bool(value) and value[0] == value[-1] == '"'
    if not is_quoted:
        return value
    # Not the real unquoting: following the RFC would break file uploads
    # from internet explorer (and probably other browsers), which sends
    # file names such as "C:\foo\bar.txt".
    inner = value[1:-1]
    if is_filename and inner[:2] == '\\\\':
        # A UNC path: un-escaping would turn the leading double backslash
        # into a single one, so hand it back untouched.
        return inner
    return inner.replace('\\\\', '\\').replace('\\"', '"')


def parse_dict_header(value):
    """Convert a header holding a list of ``key=value`` pairs (RFC 2068
    Section 2) into a python dict:

    >>> d = parse_dict_header('foo="is a fish", bar="as well"')
    >>> type(d) is dict
    True
    >>> sorted(d.items())
    [('bar', 'as well'), ('foo', 'is a fish')]

    A key without a value is mapped to `None`:

    >>> parse_dict_header('key_without_value')
    {'key_without_value': None}

    Values wrapped in double quotes are unquoted with
    :func:`unquote_header_value`.

    :param value: a string with a dict header.
    :return: :class:`dict`
    """
    parsed = {}
    for item in parse_http_list(value):
        name, sep, content = item.partition('=')
        if not sep:
            parsed[item] = None
            continue
        if content[:1] == content[-1:] == '"':
            content = unquote_header_value(content[1:-1])
        parsed[name] = content
    return parsed


class Headers(object):
    '''Utility for managing HTTP headers for both clients and servers.

    It has a dictionary like interface with few extra functions to facilitate
    the insertion of multiple header values. Header fields are
    **case insensitive**, therefore doing::

        >>> h = Headers()
        >>> h['Content-Length'] = '1050'

    is equivalent to

        >>> h['content-length'] = '1050'

    :param headers: optional iterable over header field/value pairs.
    :param kind: optional headers type, one of ``server``, ``client`` or
        ``both``.
    :param strict: if ``True`` only valid headers field will be included.

    This :class:`Headers` container maintains an ordering as suggested by
    http://www.w3.org/Protocols/rfc2616/rfc2616.html:

    .. epigraph::

        The order in which header fields with differing field names are
        received is not significant. However, it is "good practice" to send
        general-header fields first, followed by request-header or
        response-header fields, and ending with the entity-header fields.

        -- rfc2616 section 4.2

    The strict parameter is rarely used and it forces the omission on
    non-standard header fields.
    '''
    def __init__(self, headers=None, kind='server', strict=False):
        # ``kind`` is accepted either as an integer code or as a name.
        if isinstance(kind, int):
            kind = header_type.get(kind, 'both')
        else:
            kind = str(kind).lower()
        self.kind = kind
        self.strict = strict
        self.all_headers = TYPE_HEADER_FIELDS.get(self.kind)
        if not self.all_headers:
            # Unknown kind: accept both client and server fields.
            self.kind = 'both'
            self.all_headers = TYPE_HEADER_FIELDS[self.kind]
        # canonical field name -> list of values
        self._headers = {}
        if headers is not None:
            self.update(headers)

    def __repr__(self):
        return '%s %s' % (self.kind, self._headers.__repr__())

    def __str__(self):
        return '\r\n'.join(self._ordered())

    def __bytes__(self):
        return str(self).encode(DEFAULT_CHARSET)

    def __len__(self):
        return len(self._headers)

    @property
    def kind_number(self):
        '''Integer code of :attr:`kind`.'''
        return header_type_to_int.get(self.kind)

    def update(self, iterable):
        """Extend the headers with an ``iterable``.

        :param iterable: a dictionary or an iterable over keys, values
            tuples.
        """
        for key, value in mapping_iterator(iterable):
            self.add_header(key, value)

    def override(self, iterable):
        '''Extend headers by overriding fields from iterable.

        The first occurrence of a field in ``iterable`` replaces the
        current values, further occurrences are added to it.

        :param iterable: a dictionary or an iterable over keys, values
            tuples.
        '''
        seen = set()
        for key, value in mapping_iterator(iterable):
            key = key.lower()
            if key in seen:
                self.add_header(key, value)
            else:
                seen.add(key)
                self[key] = value

    def copy(self):
        '''Return a new container of the same class, kind and strictness
        built from these headers.'''
        return self.__class__(self, kind=self.kind, strict=self.strict)

    def __contains__(self, key):
        return header_field(key) in self._headers

    def __getitem__(self, key):
        key = header_field(key)
        values = self._headers[key]
        joiner = HEADER_FIELDS_JOINER.get(key, ', ')
        if joiner is None:
            # Fields which are never joined on the wire still need a
            # separator to be returned as a single string.
            joiner = '; '
        return joiner.join(values)

    def __delitem__(self, key):
        self._headers.__delitem__(header_field(key))

    def __setitem__(self, key, value):
        key = header_field(key, self.all_headers, self.strict)
        # Rejected fields and empty values are silently ignored.
        if key and value:
            if not isinstance(value, list):
                value = header_values(key, value)
            self._headers[key] = value

    def get(self, key, default=None):
        '''Get the field value at ``key`` as comma separated values.

        For example::

            >>> from pulsar.utils.httpurl import Headers
            >>> h = Headers(kind='client')
            >>> h.add_header('accept-encoding', 'gzip')
            >>> h.add_header('accept-encoding', 'deflate')
            >>> h.get('accept-encoding')

        results in::

            'gzip, deflate'
        '''
        if key in self:
            return self.__getitem__(key)
        else:
            return default

    def get_all(self, key, default=None):
        '''Get the values at header ``key`` as a list rather than a
        string separated by comma (which is returned by the
        :meth:`get` method).

        For example::

            >>> from pulsar.utils.httpurl import Headers
            >>> h = Headers(kind='client')
            >>> h.add_header('accept-encoding', 'gzip')
            >>> h.add_header('accept-encoding', 'deflate')
            >>> h.get_all('accept-encoding')

        results in::

            ['gzip', 'deflate']
        '''
        return self._headers.get(header_field(key), default)

    def has(self, field, value):
        '''Check if ``value`` is available in header ``field``.

        The comparison is case insensitive.
        '''
        value = value.lower()
        for c in self.get_all(field, ()):
            if c.lower() == value:
                return True
        return False

    def pop(self, key, *args):
        '''Same as :meth:`dict.pop` on the canonical name of ``key``.'''
        return self._headers.pop(header_field(key), *args)

    def clear(self):
        '''Same as :meth:`dict.clear`, it removes all headers.
        '''
        self._headers.clear()

    def getheaders(self, key):  # pragma    nocover
        '''Required by cookielib in python 2.

        If the key is not available, it returns an empty list.
        '''
        return self._headers.get(header_field(key), [])

    def add_header(self, key, values):
        '''Add ``values`` to ``key`` header.

        If the header is already available, append the value to the list.
        Empty values and values already stored are skipped.

        :param key: header name
        :param values: a string value or a list/tuple of strings values
            for header ``key``
        '''
        key = header_field(key, self.all_headers, self.strict)
        if key and values:
            if not isinstance(values, (tuple, list)):
                values = header_values(key, values)
            current = self._headers.get(key, [])
            for value in values:
                if value and value not in current:
                    current.append(value)
            self._headers[key] = current

    def remove_header(self, key, value=None):
        '''Remove the header at ``key``.

        If ``value`` is provided, it removes only that value if found
        (case insensitive comparison); the field itself is dropped once it
        has no value left, and a field which is not present is left absent.

        :return: with ``value``, the last removed value (as stored) or
            ``None`` if nothing matched; without ``value``, the list of
            values removed or ``None`` if the field was not present.
        '''
        key = header_field(key, self.all_headers, self.strict)
        if key:
            if value:
                value = value.lower()
                values = self._headers.get(key)
                if not values:
                    return None
                removed = None
                kept = []
                # Build a new list instead of removing from the list being
                # iterated, which would skip the item after each removal.
                for v in values:
                    if v.lower() == value:
                        removed = v
                    else:
                        kept.append(v)
                if kept:
                    self._headers[key] = kept
                else:
                    # No value left: do not keep an empty header around.
                    del self._headers[key]
                return removed
            else:
                return self._headers.pop(key, None)

    def flat(self, version, status):
        '''Full headers bytes representation.

        :param version: tuple with major and minor HTTP version.
        :param status: the status text following the version.
        '''
        vs = version + (status, self)
        return ('HTTP/%s.%s %s\r\n%s' % vs).encode(DEFAULT_CHARSET)

    def __iter__(self):
        dj = ', '
        for k, values in iteritems(self._headers):
            joiner = HEADER_FIELDS_JOINER.get(k, dj)
            if joiner:
                yield k, joiner.join(values)
            else:
                # No joiner: one pair for each value.
                for value in values:
                    yield k, value

    def _ordered(self):
        # Yield the header lines grouped as general, request, response and
        # entity fields, followed by two empty strings so that joining
        # with CRLF terminates the headers with a blank line.
        hf = HEADER_FIELDS
        hj = HEADER_FIELDS_JOINER
        dj = ', '
        order = (('general', []), ('request', []), ('response', []),
                 ('entity', []))
        headers = self._headers
        for key in headers:
            for name, group in order:
                if key in hf[name]:
                    group.append(key)
                    break
            if key not in group:    # non-standard header
                # ``group`` is the last (entity) group at this point.
                group.append(key)
        for _, group in order:
            for k in group:
                joiner = hj.get(k, dj)
                if not joiner:
                    for header in headers[k]:
                        yield "%s: %s" % (k, header)
                else:
                    yield "%s: %s" % (k, joiner.join(headers[k]))
        yield ''
        yield ''
###############################################################################
#    HTTP PARSER
###############################################################################
# The patterns are raw strings so that regex escapes such as ``\d`` are not
# read as (invalid) string escape sequences.
# NOTE(review): inside the class ``$-_`` is a character range, not the three
# literal characters -- left as is to keep accepting the same methods.
METHOD_RE = re.compile(r"[A-Z0-9$-_.]{3,20}")
# The dot between major and minor version is matched literally.
VERSION_RE = re.compile(r"HTTP/(\d+)\.(\d+)")
STATUS_RE = re.compile(r"(\d{3})\s*(\w*)")
# Characters which are not allowed in a header name.
HEADER_RE = re.compile(r'[\x00-\x1F\x7F()<>@,;:\[\]={} \t\\"]')

# errors
BAD_FIRST_LINE = 0
INVALID_HEADER = 1
INVALID_CHUNK = 2


class InvalidRequestLine(Exception):
    """ error raised when first line is invalid """


class InvalidHeader(Exception):
    """ error raised on invalid header """


class InvalidChunkSize(Exception):
    """ error raised when we parse an invalid chunk size """
class HttpParser(object):
    '''A python HTTP parser.

    Original code from https://github.com/benoitc/http-parser

    2011 (c) Benoit Chesneau <benoitc@e-engura.org>

    Data is fed incrementally through :meth:`execute`; the parser moves
    through three stages (first line, headers, body) and exposes the
    parsed pieces through the ``get_*`` and ``is_*`` methods.

    :param kind: ``0`` to parse requests only, ``1`` to parse responses
        only, ``2`` (default) to auto detect by trying a request line
        first and falling back to a response line.
    :param decompress: when true and the ``Content-Encoding`` header is
        ``gzip`` or ``deflate``, body data is passed through a
        :mod:`zlib` decompress object.
    :param method: initial value for the HTTP method. When it is
        ``"HEAD"`` and a status code has been parsed, the message is
        treated as having a zero-length body.

    On a parsing error :attr:`errno` and :attr:`errstr` are set.
    '''
    def __init__(self, kind=2, decompress=False, method=None):
        self.decompress = decompress
        # errors vars
        self.errno = None
        self.errstr = ""
        # protected variables
        self._buf = []              # raw bytes not yet consumed
        self._version = None
        self._method = method
        self._status_code = None
        self._status = None
        self._reason = None
        self._url = None
        self._path = None
        self._query_string = None
        self._kind = kind
        self._fragment = None
        self._headers = OrderedDict()
        self._chunked = False
        self._body = []             # parsed body pieces awaiting recv_body
        self._trailers = None
        self._partial_body = False
        self._clen = None           # Content-Length, when known
        self._clen_rest = None      # body bytes still expected
        # private events
        self.__on_firstline = False
        self.__on_headers_complete = False
        self.__on_message_begin = False
        self.__on_message_complete = False
        self.__decompress_obj = None

    @property
    def kind(self):
        '''The kind of message this parser handles (see class docs).'''
        return self._kind

    def get_version(self):
        '''HTTP version as a ``(major, minor)`` tuple, or ``None``.'''
        return self._version

    def get_method(self):
        '''The (upper-cased) request method, or the one given at init.'''
        return self._method

    def get_status_code(self):
        '''Integer status code of a parsed response line, or ``None``.'''
        return self._status_code

    def get_url(self):
        '''The raw request target from the request line, or ``None``.'''
        return self._url

    def get_path(self):
        '''Path component of the request target, or ``None``.'''
        return self._path

    def get_query_string(self):
        '''Query component of the request target, or ``None``.'''
        return self._query_string

    def get_fragment(self):
        '''Fragment component of the request target, or ``None``.'''
        return self._fragment

    def get_headers(self):
        '''Mapping of header name to the list of values received.'''
        return self._headers

    def recv_body(self):
        """ return last chunk of the parsed body"""
        body = b''.join(self._body)
        # Hand the accumulated data over exactly once.
        self._body = []
        self._partial_body = False
        return body

    def is_headers_complete(self):
        """ return True if all headers have been parsed. """
        return self.__on_headers_complete

    def is_partial_body(self):
        """ return True if a chunk of body have been parsed """
        return self._partial_body

    def is_message_begin(self):
        """ return True if the parsing start """
        return self.__on_message_begin

    def is_message_complete(self):
        """ return True if the parsing is done (we get EOF) """
        return self.__on_message_complete

    def is_chunked(self):
        """ return True if Transfer-Encoding header value is chunked"""
        return self._chunked

    def execute(self, data, length):
        '''Feed ``data`` to the parser.

        :param data: bytes received from the peer.
        :param length: number of bytes in ``data``; passing ``0`` marks
            the message as complete (end of body signalled manually).
        :return: an integer. ``length`` (or ``len(data)``) when all the
            data was buffered/consumed, the number of bytes parsed so
            far when the first line or the headers are invalid, a
            negative number when a chunk size is invalid and ``0`` when
            the message was already complete.
        '''
        # end of body can be passed manually by putting a length of 0
        if length == 0:
            self.__on_message_complete = True
            return length
        #
        data = bytes(data)
        # start to parse
        nb_parsed = 0
        while True:
            if not self.__on_firstline:
                idx = data.find(b'\r\n')
                if idx < 0:
                    # First line still incomplete: keep buffering.
                    self._buf.append(data)
                    return len(data)
                else:
                    self.__on_firstline = True
                    self._buf.append(data[:idx])
                    first_line = native_str(b''.join(self._buf),
                                            DEFAULT_CHARSET)
                    rest = data[idx+2:]
                    data = b''
                    if self._parse_firstline(first_line):
                        nb_parsed = nb_parsed + idx + 2
                        self._buf = [rest]
                    else:
                        return nb_parsed
            elif not self.__on_headers_complete:
                if data:
                    self._buf.append(data)
                    data = b''
                try:
                    to_parse = b''.join(self._buf)
                    ret = self._parse_headers(to_parse)
                    if ret is False:
                        # Headers not terminated yet, wait for more data.
                        return length
                    nb_parsed = nb_parsed + (len(to_parse) - ret)
                except InvalidHeader as e:
                    self.errno = INVALID_HEADER
                    self.errstr = str(e)
                    return nb_parsed
            elif not self.__on_message_complete:
                self.__on_message_begin = True
                if data:
                    self._buf.append(data)
                    data = b''
                ret = self._parse_body()
                if ret is None:
                    return length
                elif ret < 0:
                    return ret
                elif ret == 0:
                    self.__on_message_complete = True
                    return length
                else:
                    # A chunk was consumed; loop to try the next one.
                    nb_parsed = max(length, ret)
            else:
                return 0

    def _parse_firstline(self, line):
        '''Parse the request/response line according to :attr:`kind`.

        Return ``True`` on success. On failure set :attr:`errno` and
        :attr:`errstr` and return ``False``.
        '''
        try:
            if self.kind == 2:  # auto detect
                try:
                    self._parse_request_line(line)
                except InvalidRequestLine:
                    self._parse_response_line(line)
            elif self.kind == 1:
                self._parse_response_line(line)
            elif self.kind == 0:
                self._parse_request_line(line)
        except InvalidRequestLine as e:
            self.errno = BAD_FIRST_LINE
            self.errstr = str(e)
            return False
        return True

    def _parse_response_line(self, line):
        '''Parse a status line, setting version, status, code and reason.

        Raise ``InvalidRequestLine`` if the line is malformed.
        '''
        bits = line.split(None, 1)
        if len(bits) != 2:
            raise InvalidRequestLine(line)
        # version
        matchv = VERSION_RE.match(bits[0])
        if matchv is None:
            raise InvalidRequestLine("Invalid HTTP version: %s" % bits[0])
        self._version = (int(matchv.group(1)), int(matchv.group(2)))
        # status
        matchs = STATUS_RE.match(bits[1])
        if matchs is None:
            # The conversion type was missing here ("%" instead of "%s"),
            # which raised ValueError instead of InvalidRequestLine and
            # therefore escaped the handler in _parse_firstline.
            raise InvalidRequestLine("Invalid status %s" % bits[1])
        self._status = bits[1]
        self._status_code = int(matchs.group(1))
        self._reason = matchs.group(2)

    def _parse_request_line(self, line):
        '''Parse a request line, setting method, url parts and version.

        Raise ``InvalidRequestLine`` if the line is malformed.
        '''
        bits = line.split(None, 2)
        if len(bits) != 3:
            raise InvalidRequestLine(line)
        # Method
        if not METHOD_RE.match(bits[0]):
            raise InvalidRequestLine("invalid Method: %s" % bits[0])
        self._method = bits[0].upper()
        # URI
        self._url = bits[1]
        # A dummy scheme and host are prepended so that urlsplit
        # separates path, query and fragment of the request target.
        parts = urlsplit('http://dummy.com%s' % bits[1])
        self._path = parts.path or ""
        self._query_string = parts.query or ""
        self._fragment = parts.fragment or ""
        # Version
        match = VERSION_RE.match(bits[2])
        if match is None:
            raise InvalidRequestLine("Invalid HTTP version: %s" % bits[2])
        self._version = (int(match.group(1)), int(match.group(2)))

    def _parse_headers(self, data):
        '''Parse the header section contained in ``data``.

        Return ``False`` when the blank line terminating the headers has
        not been received yet, otherwise the number of bytes following
        the headers (these are left in the buffer as start of the body).
        Raise ``InvalidHeader`` for an invalid header name.
        '''
        if data == b'\r\n':
            # No headers at all.
            self.__on_headers_complete = True
            self._buf = []
            return 0
        idx = data.find(b'\r\n\r\n')
        if idx < 0:  # we don't have all headers
            return False
        chunk = native_str(data[:idx], DEFAULT_CHARSET)
        # Split lines on \r\n keeping the \r\n on each line
        lines = deque(('%s\r\n' % line for line in chunk.split('\r\n')))
        # Parse headers into key/value pairs paying attention
        # to continuation lines.
        while len(lines):
            # Parse initial header name : value pair.
            curr = lines.popleft()
            if curr.find(":") < 0:
                continue
            name, value = curr.split(":", 1)
            name = name.rstrip(" \t").upper()
            if HEADER_RE.search(name):
                raise InvalidHeader("invalid header name %s" % name)
            name, value = header_field(name.strip()), [value.lstrip()]
            # Consume value continuation lines
            while len(lines) and lines[0].startswith((" ", "\t")):
                value.append(lines.popleft())
            value = ''.join(value).rstrip()
            if name in self._headers:
                self._headers[name].append(value)
            else:
                self._headers[name] = [value]
        # detect now if body is sent by chunks.
        clen = self._headers.get('Content-Length')
        if 'Transfer-Encoding' in self._headers:
            te = self._headers['Transfer-Encoding'][0].lower()
            self._chunked = (te == 'chunked')
        else:
            self._chunked = False
        #
        status = self._status_code
        if status and (status == httpclient.NO_CONTENT or
                       status == httpclient.NOT_MODIFIED or
                       100 <= status < 200 or      # 1xx codes
                       self._method == "HEAD"):
            # These responses are handled as having no body.
            clen = 0
        elif clen is not None:
            try:
                clen = int(clen[0])
            except ValueError:
                clen = None
            else:
                if clen < 0:  # ignore nonsensical negative lengths
                    clen = None
        #
        if clen is None:
            # Unknown length: accept data until told otherwise.
            self._clen_rest = sys.maxsize
        else:
            self._clen_rest = self._clen = clen
        #
        # detect encoding and set decompress object
        if self.decompress and 'Content-Encoding' in self._headers:
            encoding = self._headers['Content-Encoding'][0]
            if encoding == "gzip":
                self.__decompress_obj = zlib.decompressobj(16+zlib.MAX_WBITS)
            elif encoding == "deflate":
                self.__decompress_obj = zlib.decompressobj()
        rest = data[idx+4:]
        self._buf = [rest]
        self.__on_headers_complete = True
        self.__on_message_begin = True
        return len(rest)

    def _parse_body(self):
        '''Consume buffered data as message body.

        For a non-chunked body return ``None``. For a chunked body
        return ``None`` when a full chunk is not available yet, ``0``
        when the terminating chunk was read, ``-1`` on an invalid chunk
        size, otherwise a positive number after one chunk was consumed.
        '''
        data = b''.join(self._buf)
        #
        if not self._chunked:
            #
            if not data and self._clen is None:
                if not self._status:    # message complete only for servers
                    self.__on_message_complete = True
            else:
                if self._clen_rest is not None:
                    self._clen_rest -= len(data)
                # maybe decompress
                if self.__decompress_obj is not None:
                    data = self.__decompress_obj.decompress(data)
                self._partial_body = True
                if data:
                    self._body.append(data)
                self._buf = []
                if self._clen_rest <= 0:
                    self.__on_message_complete = True
            return
        else:
            try:
                size, rest = self._parse_chunk_size(data)
            except InvalidChunkSize as e:
                self.errno = INVALID_CHUNK
                self.errstr = "invalid chunk size [%s]" % str(e)
                return -1
            if size == 0:
                return size
            # The chunk data is followed by a 2 bytes CRLF terminator.
            if size is None or len(rest) < size + 2:
                return None
            body_part, rest = rest[:size], rest[size:]
            if self.__decompress_obj is not None:
                body_part = self.__decompress_obj.decompress(body_part)
            self._partial_body = True
            self._body.append(body_part)
            rest = rest[2:]
            self._buf = [rest] if rest else []
            return len(rest) + 2

    def _parse_chunk_size(self, data):
        '''Read the chunk-size line at the start of ``data``.

        Return ``(size, rest)`` where ``rest`` is the data following the
        size line, ``(None, None)`` when the size line is incomplete and
        ``(0, None)`` for the last chunk. Raise ``InvalidChunkSize`` if
        the size is not a hexadecimal number.
        '''
        idx = data.find(b'\r\n')
        if idx < 0:
            return None, None
        line, rest_chunk = data[:idx], data[idx+2:]
        # Anything after ';' is a chunk extension and is ignored.
        chunk_size = line.split(b';', 1)[0].strip()
        try:
            chunk_size = int(chunk_size, 16)
        except ValueError:
            raise InvalidChunkSize(chunk_size)
        if chunk_size == 0:
            self._parse_trailers(rest_chunk)
            return 0, None
        return chunk_size, rest_chunk

    def _parse_trailers(self, data):
        '''Handle the data following the last chunk.'''
        idx = data.find(b'\r\n\r\n')
        if data[:2] == b'\r\n':
            # NOTE(review): _parse_headers returns an int or False and
            # updates the parser state rather than returning a mapping
            # of trailers - confirm this is the intended value.
            self._trailers = self._parse_headers(data[:idx])
if not hasextensions:   # pragma    nocover
    setDefaultHttpParser(HttpParser)


# ############################################    UTILITIES, ENCODERS, PARSERS
def get_environ_proxies():
    """Return a dict of environment proxies. From requests_.

    For each supported scheme the ``<scheme>_proxy`` environment
    variable is looked up first, then its upper-case form. Schemes
    without a (non-empty) value are left out of the result.
    """
    proxy_keys = ('all', 'http', 'https', 'ftp', 'socks', 'ws', 'wss', 'no')
    proxies = {}
    for key in proxy_keys:
        name = key + '_proxy'
        value = os.environ.get(name) or os.environ.get(name.upper())
        if value:
            proxies[key] = value
    return proxies


def appendslash(url):
    '''Append a slash to *url* if it does not have one.'''
    if not url.endswith('/'):
        url = '%s/' % url
    return url


def choose_boundary():
    """Our embarrassingly-simple replacement for mimetools.choose_boundary."""
    return uuid4().hex


def get_content_type(filename):
    '''Guess the mime type of ``filename`` from its name.

    Fall back to ``application/octet-stream`` when it cannot be guessed.
    '''
    return mimetypes.guess_type(filename)[0] or 'application/octet-stream'


def encode_multipart_formdata(fields, boundary=None, charset=None):
    """Encode a dictionary of ``fields`` using the multipart/form-data format.

    :param fields:
        Dictionary of fields or list of (key, value) field tuples. The key is
        treated as the field name, and the value as the body of the form-data
        bytes. If the value is a tuple of two elements, then the first element
        is treated as the filename of the form-data section.

        Field names and filenames must be unicode.

    :param boundary:
        If not specified, then a random boundary will be generated using
        :func:`choose_boundary`.

    :param charset:
        Encoding used for the boundary and part headers, ``utf-8`` when
        not specified.

    :return: a two-elements tuple ``(body, content_type)`` with the encoded
        body as bytes and the value for the ``Content-Type`` header.
    """
    charset = charset or 'utf-8'
    body = BytesIO()
    if boundary is None:
        boundary = choose_boundary()
    for fieldname, value in mapping_iterator(fields):
        body.write(('--%s\r\n' % boundary).encode(charset))
        if isinstance(value, tuple):
            # (filename, data) pair: a file upload section
            filename, data = value
            body.write(('Content-Disposition: form-data; name="%s"; '
                        'filename="%s"\r\n' % (fieldname, filename))
                       .encode(charset))
            body.write(('Content-Type: %s\r\n\r\n' %
                       (get_content_type(filename))).encode(charset))
        else:
            data = value
            body.write(('Content-Disposition: form-data; name="%s"\r\n'
                        % (fieldname)).encode(charset))
            body.write(b'Content-Type: text/plain\r\n\r\n')
        body.write(to_bytes(data))
        body.write(b'\r\n')
    # closing boundary
    body.write(('--%s--\r\n' % (boundary)).encode(charset))
    content_type = 'multipart/form-data; boundary=%s' % boundary
    return body.getvalue(), content_type


def hexmd5(x):
    '''Hexadecimal MD5 digest of ``x`` (converted via ``to_bytes``).'''
    return md5(to_bytes(x)).hexdigest()


def hexsha1(x):
    '''Hexadecimal SHA1 digest of ``x`` (converted via ``to_bytes``).'''
    return sha1(to_bytes(x)).hexdigest()


def http_date(epoch_seconds=None):
    """
    Formats the time to match the RFC1123 date format as specified by HTTP
    RFC2616 section 3.3.1.

    Accepts a floating point number expressed in seconds since the epoch, in
    UTC - such as that outputted by time.time(). If set to None, defaults to
    the current time.

    Outputs a string in the format 'Wdy, DD Mon YYYY HH:MM:SS GMT'.
    """
    return formatdate(epoch_seconds, usegmt=True)


# ################################################################# COOKIE
def create_cookie(name, value, **kwargs):
    """Make a cookie from underspecified parameters.

    By default, the pair of `name` and `value` will be set for the domain ''
    and sent on every request (this is sometimes called a "supercookie").

    Keyword arguments override the default cookie attributes; a
    :class:`TypeError` is raised for any keyword which is not one of them.
    """
    result = dict(
        version=0,
        name=name,
        value=value,
        port=None,
        domain='',
        path='/',
        secure=False,
        expires=None,
        discard=True,
        comment=None,
        comment_url=None,
        rest={'HttpOnly': None},
        rfc2109=False,)
    badargs = set(kwargs) - set(result)
    if badargs:
        err = 'create_cookie() got unexpected keyword arguments: %s'
        # sorted so that the message does not depend on set ordering
        raise TypeError(err % sorted(badargs))
    result.update(kwargs)
    # Flags derived from the final attribute values.
    result['port_specified'] = bool(result['port'])
    result['domain_specified'] = bool(result['domain'])
    result['domain_initial_dot'] = result['domain'].startswith('.')
    result['path_specified'] = bool(result['path'])
    return Cookie(**result)


def cookiejar_from_dict(*cookie_dicts):
    """Returns a CookieJar from one or more key/value dictionaries.

    :param cookie_dicts: dictionaries of key/values (or ``CookieJar``
        instances) to insert into the CookieJar. Empty entries are ignored.
        When the only non-empty entry is a ``CookieJar``, that same
        instance is returned.
    """
    cookie_dicts = tuple((d for d in cookie_dicts if d))
    if len(cookie_dicts) == 1 and isinstance(cookie_dicts[0], CookieJar):
        return cookie_dicts[0]
    cookiejar = CookieJar()
    for cookie_dict in cookie_dicts:
        if isinstance(cookie_dict, CookieJar):
            for cookie in cookie_dict:
                cookiejar.set_cookie(cookie)
        else:
            for name in cookie_dict:
                cookiejar.set_cookie(create_cookie(name, cookie_dict[name]))
    return cookiejar


# Separator between the entries of a comma separated header value
cc_delim_re = re.compile(r'\s*,\s*')


def patch_vary_headers(response, newheaders):
    """\
Adds (or updates) the "Vary" header in the given HttpResponse object.
newheaders is a list of header names that should be in "Vary". Existing
headers in "Vary" aren't removed.

For information on the Vary header, see:

    http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.44
    """
    # Note that we need to keep the original order intact, because cache
    # implementations may rely on the order of the Vary contents in, say,
    # computing an MD5 hash.
    if 'Vary' in response:
        vary_headers = cc_delim_re.split(response['Vary'])
    else:
        vary_headers = []
    # Use .lower() here so we treat headers as case-insensitive.
    existing_headers = set([header.lower() for header in vary_headers])
    additional_headers = [newheader for newheader in newheaders
                          if newheader.lower() not in existing_headers]
    response['Vary'] = ', '.join(vary_headers + additional_headers)


def has_vary_header(response, header_query):
    """
    Checks to see if the response has a given header name in its Vary header.
    """
    if not response.has_header('Vary'):
        return False
    vary_headers = cc_delim_re.split(response['Vary'])
    # case-insensitive comparison
    existing_headers = set([header.lower() for header in vary_headers])
    return header_query.lower() in existing_headers


class CacheControl(object):
    '''
    http://www.mnot.net/cache_docs/

    Callable which sets the ``cache-control`` header on a headers
    container supporting item assignment and ``add_header``.

    .. attribute:: maxage

        Specifies the maximum amount of time that a representation will be
        considered fresh.

    .. attribute:: private

        When :attr:`maxage` is set, add ``private`` rather than ``public``.

    .. attribute:: must_revalidate

        When :attr:`maxage` is set, add ``must-revalidate``.

    .. attribute:: proxy_revalidate

        When :attr:`maxage` is set and :attr:`must_revalidate` is not,
        add ``proxy-revalidate``.

    .. attribute:: nostore

        Disable caching altogether; takes precedence over :attr:`maxage`.
    '''
    def __init__(self, maxage=None, private=False,
                 must_revalidate=False, proxy_revalidate=False,
                 nostore=False):
        self.maxage = maxage
        self.private = private
        self.must_revalidate = must_revalidate
        self.proxy_revalidate = proxy_revalidate
        self.nostore = nostore

    def __call__(self, headers):
        '''Set the ``cache-control`` entries on ``headers``.'''
        if self.nostore:
            headers['cache-control'] = ('no-store, no-cache, must-revalidate,'
                                        ' max-age=0')
        elif self.maxage:
            headers['cache-control'] = 'max-age=%s' % self.maxage
            if self.private:
                headers.add_header('cache-control', 'private')
            else:
                headers.add_header('cache-control', 'public')
            if self.must_revalidate:
                headers.add_header('cache-control', 'must-revalidate')
            elif self.proxy_revalidate:
                headers.add_header('cache-control', 'proxy-revalidate')
        else:
            # neither nostore nor a max age: always revalidate
            headers['cache-control'] = 'no-cache'