【python】獲取http響應

本文轉載自查看原文 2017-10-09 17:01 3980 python

一個相對完整的http請求，輸入ip和端口，輸出響應碼，響應頭，響應體，是否超時，以及出錯時的錯誤信息

處理包括：

1.協議處理，如果是443用https，其他用http

2.HTTPError處理，HTTPError一般是401,403,404之類的錯誤，雖然報錯，但是也有響應頭。注意獲取錯誤信息時要用str(e)；repr(e)雖然也返回字符串，但得到的是異常對象的表示形式，而不是可讀的錯誤原因；e.read()是響應體，也不是錯誤原因

3.URLError處理，一般是Connection refused之類的錯誤。注意獲取錯誤信息時要用str(e.reason)

4.響應體gzip解壓

5.響應體編碼轉換

# coding=utf8

import urllib2
import chardet
import traceback
import StringIO
import re
import gzip


def plugin_homepage(data, timeout):
    """Fetch the home page of ``data["ip"]:data["port"]`` and summarise it.

    Args:
        data: mapping with the keys ``"ip"`` and ``"port"``. The port may be
            an integer or its string form.
        timeout: value handed to ``get_html`` as the request timeout.

    Returns:
        A dict with the keys ``ip``, ``port``, ``rsp_header``, ``rsp_body``,
        ``code``, ``title``, ``is_timeout`` and ``error_reason``; the last six
        are taken from the tuple returned by ``get_html``.
    """
    ip = data["ip"]
    port = data["port"]
    # Port 443 means HTTPS. Compare the string form as well so that a port
    # given as "443" is not silently probed over plain HTTP.
    use_https = port == 443 or str(port) == "443"
    scheme = "https" if use_https else "http"
    url = "%s://%s:%s/" % (scheme, ip, port)
    is_timeout, error_reason, code, header, body, title = get_html(url, timeout)
    return {
        "ip": ip,
        "port": port,
        "rsp_header": header,
        "rsp_body": body,
        "code": code,
        "title": title,
        "is_timeout": is_timeout,
        "error_reason": error_reason,
    }


def get_html(url, timeout):
    """Request *url* and return the details of the HTTP response.

    Args:
        url: the URL to open.
        timeout: timeout passed to ``urllib2.urlopen``.

    Returns:
        A 6-tuple ``(is_timeout, error_reason, code, header, body, title)``:

        * ``is_timeout`` -- True when the failure message contains
          "timed out".
        * ``error_reason`` -- ``str`` of the error, or None on success.
        * ``code`` -- HTTP status code (also set for ``HTTPError``
          responses), or None when no response was received.
        * ``header`` -- response headers rendered with ``str()``, or None.
        * ``body`` -- response body, gunzipped and re-encoded to UTF-8 when
          that is possible, otherwise as received.
        * ``title`` -- stripped contents of the first ``<title>`` element,
          or None.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    is_timeout = False
    error_reason = None
    code = None
    header = None
    body = None
    title = None
    response = None
    try:
        request = urllib2.Request(url, headers=headers)
        try:
            response = urllib2.urlopen(request, timeout=timeout)
            code = response.getcode()
        except urllib2.HTTPError as e:
            # 401/403/404 and friends: an error, but it still carries a
            # status code, headers and a body, so treat it as the response.
            error_reason = str(e)
            code = e.code
            response = e
        body = response.read()
        # Keep the message object (not its str) so header lookups below work.
        header = response.headers
    except urllib2.URLError as e:
        # Connection-level failure such as "Connection refused".
        traceback.print_exc()
        error_reason = str(e.reason)
        is_timeout = "timed out" in error_reason
        return is_timeout, error_reason, code, header, body, title
    except Exception as e:
        # Includes timeouts raised while reading, which are not URLErrors.
        traceback.print_exc()
        error_reason = str(e)
        is_timeout = "timed out" in error_reason
        return is_timeout, error_reason, code, header, body, title
    finally:
        if response is not None:
            response.close()

    if not header:
        header_text = None if header is None else str(header)
        return is_timeout, error_reason, code, header_text, body, title

    # Decompress a gzip-encoded body.
    if 'Content-Encoding' in header and 'gzip' in header['Content-Encoding'].lower():
        try:
            body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
        except Exception:
            # Corrupt, truncated or mislabelled stream: keep the raw body.
            traceback.print_exc()

    # Convert the body to UTF-8 when its encoding can be determined.
    html_encode = None
    try:
        html_encode = get_encode(header, body)
    except Exception:
        traceback.print_exc()
    if html_encode:
        html_encode = html_encode.strip()
    if html_encode:
        try:
            body = body.decode(html_encode).encode('utf-8')
        except (LookupError, ValueError, TypeError):
            # Unknown codec name or undecodable bytes: best effort only,
            # leave the body exactly as it was received.
            pass

    # Extract the page title; re.S lets a title span several lines.
    if body:
        found = re.search(r'<title\b[^>]*>(.*?)</title>', body, flags=re.I | re.S)
        if found:
            title = found.group(1).strip()
    return is_timeout, error_reason, code, str(header), body, title


def get_encode(header, body):
    """Guess the character encoding of an HTML response.

    The sources are tried in this order:

    1. a ``charset=`` attribute inside a ``<meta>`` tag of *body*;
    2. the ``charset=`` parameter of the ``Content-Type`` header;
    3. ``chardet`` detection on *body*.

    Args:
        header: response headers, either as a mapping-like object with a
            ``get`` method, as raw header text, or None.
        body: the response body.

    Returns:
        The encoding name as written in the document or header (quotes
        removed), the name reported by ``chardet``, or None when nothing can
        be determined.
    """
    # 1. <meta charset="..."> or <meta ... content="text/html; charset=...">,
    #    with single, double or no quotes; the tag may span several lines.
    if body:
        meta = re.search(
            r'''<meta\b[^>]*?charset\s*=\s*["']?\s*([\w.:\-]+)''',
            body, flags=re.I)
        if meta:
            return meta.group(1)

    # 2. Content-Type header.
    content_type = None
    if header:
        getter = getattr(header, 'get', None)
        if getter is not None:
            content_type = getter('Content-Type')
        else:
            # Headers supplied as plain text: locate the Content-Type line.
            line = re.search(r'^content-type:(.*)$', str(header),
                             flags=re.I | re.M)
            if line:
                content_type = line.group(1)
    if content_type:
        param = re.search(r'''charset\s*=\s*["']?\s*([^\s;"',]+)''',
                          content_type, flags=re.I)
        if param:
            return param.group(1)

    # 3. Statistical detection; there is nothing to detect in an empty body.
    if not body:
        return None
    return chardet.detect(body)['encoding']


if __name__ == "__main__":
    # Demo run: probe a web server on localhost, port 80, with a 3 second
    # timeout and print the resulting dict.
    data = dict(ip="127.0.0.1", port=80)
    res = plugin_homepage(data, 3)
    print(res)

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱yoyou2525@163.com刪除。

猜您在找 獲取http響應時間，pycurl,python Python使用pycurl獲取http的響應時間 python獲取http請求響應頭headers中的數據 Python【HTTP響應狀態碼】 Python爬蟲(一)_HTTP的請求與響應 fetch獲取http的response響應碼 Vuejs之axios獲取Http響應頭 js獲取http請求響應頭信息 HTTP 請求/響應設置/獲取 Header參數 Java獲取Http響應Header信息