【python】獲取http響應


一個相對完整的http請求,輸入ip和端口,輸出響應碼,響應頭,響應體,是否超時,以及出錯時的錯誤信息

處理包括:

1.協議處理,如果是443用https,其他用http

2.HTTPError處理,HTTPError一般是401,403,404之類的錯誤,雖然報錯,但是也有響應碼和響應頭。注意獲取錯誤信息時要用str(e)(例如"HTTP Error 404: Not Found");repr(e)得到的只是異常對象的表示形式,不是可讀的錯誤原因;e.read()是響應體,也不是錯誤原因

3.URLError處理,一般是Connection refused之類的錯誤。注意獲取錯誤信息時要用str(e.reason)

4.響應體gzip解壓

5.響應體編碼轉換

 

# coding=utf8

import urllib2
import chardet
import traceback
import StringIO
import re
import gzip


def plugin_homepage(data, timeout):
    """Request the root page ("/") of one host/port and summarise the reply.

    Args:
        data: Mapping that provides the keys ``"ip"`` and ``"port"``.
        timeout: Timeout value forwarded unchanged to ``get_html``.

    Returns:
        A dict with the keys ``ip``, ``port``, ``rsp_header``, ``rsp_body``,
        ``code``, ``title``, ``is_timeout`` and ``error_reason``; the last
        six are exactly the values returned by ``get_html``.
    """
    ip = data["ip"]
    port = data["port"]
    # Compare as text so that a port supplied as the string "443" selects
    # HTTPS just like the integer 443 does; every other port uses HTTP.
    scheme = "https" if str(port) == "443" else "http"
    url = "%s://%s:%s/" % (scheme, ip, port)
    is_timeout, error_reason, code, header, body, title = get_html(url, timeout)
    return {
        "ip": ip,
        "port": port,
        "rsp_header": header,
        "rsp_body": body,
        "code": code,
        "title": title,
        "is_timeout": is_timeout,
        "error_reason": error_reason,
    }


def get_html(url, timeout):
    """Fetch *url* and return status, headers, body, title and error details.

    Args:
        url: Absolute ``http://`` or ``https://`` URL to request.
        timeout: Timeout passed to ``urllib2.urlopen``.

    Returns:
        A 6-tuple ``(is_timeout, error_reason, code, header, body, title)``:

        * ``is_timeout`` -- True when the failure text contains "timed out".
        * ``error_reason`` -- None on success; ``str(e)`` for an HTTPError or
          any unexpected exception; ``str(e.reason)`` for a URLError.
        * ``code`` -- HTTP status code (also set for HTTPError responses such
          as 401/403/404); None when no response was received.
        * ``header`` -- response headers rendered with ``str()``; None when
          no response was received.
        * ``body`` -- response body, gunzipped when the server says it is
          gzip-encoded and re-encoded to UTF-8 when a charset could be
          determined; otherwise the raw bytes.
        * ``title`` -- text of the first ``<title>`` element, or None.
    """
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    is_timeout = False
    error_reason = None
    code = None
    header = None
    body = None
    title = None
    try:
        request = urllib2.Request(url, headers=headers)
        response = urllib2.urlopen(request, timeout=timeout)
        try:
            code = response.getcode()
            body = response.read()
            # Keep the message object (not its str form): the gzip and
            # charset checks below need mapping-style header access.
            header = response.headers
        finally:
            response.close()
    except urllib2.HTTPError as e:
        # HTTP-level errors (401, 403, 404, ...) still carry a status code,
        # headers and a body, so record them and continue processing.
        error_reason = str(e)
        code = e.code
        header = e.headers
        try:
            body = e.read()
        except Exception:
            # The error page could not be read; status and headers are
            # still worth returning.
            traceback.print_exc()
    except urllib2.URLError as e:
        # Connection-level failure (refused, DNS, connect timeout, ...).
        traceback.print_exc()
        error_reason = str(e.reason)
        is_timeout = "timed out" in error_reason
        return is_timeout, error_reason, code, header, body, title
    except Exception as e:
        # Includes socket.timeout raised while reading the body, which is
        # not wrapped in URLError.
        traceback.print_exc()
        error_reason = str(e)
        is_timeout = "timed out" in error_reason
        return is_timeout, error_reason, code, header, body, title
    if not header:
        return is_timeout, error_reason, code, header, body, title
    # Decompress gzip-encoded bodies; on a corrupt stream keep the raw bytes.
    content_encoding = header.get('Content-Encoding') or ''
    if body and 'gzip' in content_encoding.lower():
        try:
            body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
        except Exception:
            traceback.print_exc()
    # Convert the body to UTF-8.  Best effort: an undetected or unknown
    # charset (LookupError) or undecodable bytes (UnicodeError) leave the
    # body untouched.  No length limit is applied to the charset name, so
    # 12-character names such as windows-1251 are accepted.
    if body:
        try:
            html_encode = get_encode(header, body)
            if html_encode:
                html_encode = html_encode.strip()
            if html_encode:
                body = body.decode(html_encode).encode('utf-8')
        except Exception:
            pass
        # Extract the page title.  re.S lets the title span several lines
        # and [^>]* tolerates attributes on the opening tag.
        match = re.search(r'<title[^>]*>(.*?)</title>', body,
                          flags=re.I | re.S)
        if match:
            title = match.group(1).strip()
    return is_timeout, error_reason, code, str(header), body, title


def get_encode(header, body):
    """Work out the character encoding of an HTML response.

    Sources are tried in this order and the first hit wins:

    1. a ``charset=`` attribute inside a ``<meta ...>`` tag of *body*
       (double-quoted, single-quoted or unquoted value);
    2. the ``charset`` parameter of the ``Content-Type`` entry in *header*;
    3. whatever ``chardet.detect`` reports for *body*.

    Args:
        header: Mapping-like response headers supporting ``in`` and
            ``header['Content-Type']``.  Anything else is ignored and the
            lookup falls through to chardet.
        body: Raw response body.

    Returns:
        The encoding name without surrounding quotes, or the ``encoding``
        value reported by chardet when neither declaration is present.
    """
    # 1. <meta charset=...> / <meta http-equiv ... content="...charset=...">.
    # [^>]* keeps the search inside the tag so a lazy match cannot wander
    # into later markup and capture garbage.
    try:
        m = re.search(r'<meta\b[^>]*?charset\s*=\s*["\']?\s*([\w.:-]+)',
                      body, flags=re.I)
        if m:
            return m.group(1)
    except Exception:
        # body is not searchable (e.g. None); try the other sources.
        pass
    # 2. Content-Type response header, e.g. 'text/html; charset="utf-8"'.
    try:
        if 'Content-Type' in header:
            content_type = header['Content-Type']
            m = re.search(r'charset\s*=\s*["\']?\s*([^\s;"\']+)',
                          content_type, flags=re.I)
            if m:
                return m.group(1)
    except Exception:
        # header is not a usable mapping (None, plain string, ...).
        pass
    # 3. Statistical detection as the last resort.
    detected = chardet.detect(body)
    return detected['encoding']


if __name__ == "__main__":
    # Manual smoke test: request http://127.0.0.1:80/ with a timeout of 3
    # and print the resulting summary dict.
    data = dict(ip="127.0.0.1", port=80)
    res = plugin_homepage(data, timeout=3)
    print(res)

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM