Python: Using the Requests Library to Submit a POST Form


"""
使用Requests庫完成Post表單操作
"""
#_*_codingn:utf8 _*_
import requests

'''
  Set request headers so the requests this program sends look more like
  they come from a real browser.
'''
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36(KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8"}

if __name__ == "__main__":

    params = {"username": "anything", "password": "password"}

    # A Session keeps cookies across requests, so the login cookie set by the
    # POST is sent automatically with the later GET.
    session = requests.Session()
    post_obj = session.post("http://pythonscraping.com/pages/cookies/welcome.php", data=params)

    s = session.get("http://pythonscraping.com/pages/cookies/profile.php")
    print(post_obj.text)
    print(s.text)

    # Inspect the cookies the server set on this session
    print(session.cookies.get_dict())
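The Session is the point of this example: the login cookie set by welcome.php is stored on the session and replayed automatically for profile.php. A minimal sketch of what happens without one (same test site; this follow-up code is an illustration, not part of the original script):

    # Without a Session, each request starts with an empty cookie jar, so
    # profile.php will not see the login performed by the POST above.
    r = requests.post("http://pythonscraping.com/pages/cookies/welcome.php", data=params)
    r2 = requests.get("http://pythonscraping.com/pages/cookies/profile.php")
    print(r2.text)  # expected: the "not logged in" page

    # If a Session is not an option, cookies can be forwarded by hand:
    r3 = requests.get("http://pythonscraping.com/pages/cookies/profile.php", cookies=r.cookies)
    print(r3.text)  # expected: the logged-in profile page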


# -*- coding: utf-8 -*-
'''
Target site analysis
Page structure analysis
-- Let's get started --
1. Fetch a single page
2. Extract the fields with a regex
3. Save the results as line-delimited JSON
4. Loop over pages (optionally with a process pool)
'''
# .* is greedy: it first matches as much as it can, then backtracks as far as
# the rest of the pattern requires.
# .*? is non-greedy: it takes the shortest possible match and moves on, so no
# backtracking is needed (minimal-match behaviour).
# re.S makes . match newline characters as well
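A quick demonstration of all three points (an illustrative snippet, separate from the scraper below):

import re

s = '<i>1</i><i>2</i>'
print(re.findall('<i>(.*)</i>', s))   # ['1</i><i>2']  -- greedy: grabs as much as possible
print(re.findall('<i>(.*?)</i>', s))  # ['1', '2']     -- non-greedy: minimal match
print(re.findall('<i>(.*?)</i>', '<i>a\nb</i>'))        # []         -- . stops at newlines
print(re.findall('<i>(.*?)</i>', '<i>a\nb</i>', re.S))  # ['a\nb']   -- re.S lets . cross them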
#----------------------------------
import json
import requests
from requests.exceptions import RequestException
import re
import time
from multiprocessing import Pool

headers = {  # Very important: some sites reject requests without browser-like headers
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
    'Connection': 'keep-alive',
    'Referer': 'http://maoyan.com/board/6'
}

def get_one_page(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        return None  # non-200 status code
    except RequestException:
        return None

def parse_one_page(html):
    pattern = re.compile(r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
                         + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
                         + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {  # yield turns this function into a generator
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],  # drop the leading "主演:" label
            'time': item[4].strip()[5:],   # drop the leading "上映時間:" label
            'score': item[5] + item[6]  # integer and fraction parts are matched separately, then joined
        }
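To see what the seven groups capture, here is a trimmed <dd> fragment in the shape the pattern expects (the real Maoyan markup carries more attributes; this sample is only an illustration):

sample = '''<dd>
    <i class="board-index">1</i>
    <img data-src="http://example.com/poster.jpg" />
    <p class="name"><a href="/films/1">我不是藥神</a></p>
    <p class="star">主演:徐崢,周一圍,王傳君</p>
    <p class="releasetime">上映時間:2018-07-05</p>
    <p class="score"><i class="integer">9.</i><i class="fraction">6</i></p>
</dd>'''
for movie in parse_one_page(sample):
    print(movie)
# {'index': '1', 'image': 'http://example.com/poster.jpg', 'title': '我不是藥神',
#  'actor': '徐崢,周一圍,王傳君', 'time': '2018-07-05', 'score': '9.6'}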

def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:  # append mode, utf-8 encoding
        f.write(json.dumps(content, ensure_ascii=False) + '\n')  # json.dumps escapes non-ASCII by default; ensure_ascii=False keeps the Chinese readable
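ensure_ascii=False is what keeps the Chinese text readable in result.txt:

print(json.dumps({'title': '我不是藥神'}))
# {"title": "\u6211\u4e0d\u662f\u85e5\u795e"}
print(json.dumps({'title': '我不是藥神'}, ensure_ascii=False))
# {"title": "我不是藥神"}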

def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)  # returns the page HTML, or None on failure
    if html is None:
        return
    for item in parse_one_page(html):
        # print(item)
        write_to_file(item)


if __name__ == '__main__':
    for i in range(10):
        main(offset=i * 10)
        time.sleep(1)  # be polite: pause between page requests
    # Process-pool alternative (runnable sketch below):
    # pool = Pool()
    # pool.map(main, [i * 10 for i in range(10)])
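A runnable form of that process-pool variant (a sketch; with several processes appending to result.txt concurrently, the line order is no longer guaranteed):

def run_parallel():
    # Call run_parallel() from inside the __main__ guard above
    # (required for multiprocessing on Windows).
    pool = Pool()  # defaults to one worker process per CPU core
    pool.map(main, [i * 10 for i in range(10)])  # each worker handles one offset
    pool.close()
    pool.join()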
# coding=utf-8

'''
1. Fetch the search index pages
2. Fetch each detail page
3. Download the images and save the data to a database
4. Looping and multithreading
'''

import requests
from requests.exceptions import RequestException
from json import loads
from bs4 import BeautifulSoup
user_agent = "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"
headers = {"User-Agent": user_agent}


def get_onepage_index(i, keywords):
    data = {
        "offset": i,
        "format": "json",
        "keyword": keywords,
        "autoload": "true",
        "count": "20",
        "cur_tab": "1",
        "from": "search_tab"
    }
    url = 'https://www.toutiao.com/search_content/?'
    try:
        response = requests.get(url, params=data)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        print('index page request failed')
        return None
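Passing the dict via params= lets Requests build and percent-encode the query string for you, e.g.:

r = requests.get('https://www.toutiao.com/search_content/', params={'offset': 20, 'keyword': '街拍'})
print(r.url)  # https://www.toutiao.com/search_content/?offset=20&keyword=%E8%A1%97%E6%8B%8D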


def parse_onepage_index(html):
    if html is None:  # the upstream request may have failed
        return
    # json.loads() converts a JSON str into a dict.
    data = loads(html)
    if data and 'data' in data.keys():  # make sure the payload has a 'data' key
        for item in data.get('data'):  # dict.get() returns the key's value, or None if it is missing
            yield item.get('article_url')
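A toy payload showing what this generator yields (hypothetical data, not a real Toutiao response):

sample = '{"count": 2, "data": [{"article_url": "https://example.com/a"}, {"title": "no url"}]}'
print(list(parse_onepage_index(sample)))
# ['https://example.com/a', None] -- items without article_url yield None, which main() skips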


def get_page_detail(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # print(response.status_code)
            return response.text
        return None
    except RequestException:
        print('wrong url:', url)
        return None


def parsepage(html):
    soup = BeautifulSoup(html, 'lxml')
    title = soup.title.string  # for now, only extract the page <title> text
    print(title)
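Step 3 of the outline (download images) is not implemented above. A minimal sketch, assuming the detail page exposes plain <img> tags (on real Toutiao detail pages the gallery URLs are usually embedded in inline JavaScript and would need a regex instead):

import os
from urllib.parse import urlparse

def download_images(html, out_dir='images'):
    # Hypothetical helper: collect <img src=...> URLs and save each file locally.
    soup = BeautifulSoup(html, 'lxml')
    os.makedirs(out_dir, exist_ok=True)
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            continue
        resp = requests.get(src, headers=headers)
        if resp.status_code == 200:
            name = os.path.basename(urlparse(src).path) or 'image.jpg'
            with open(os.path.join(out_dir, name), 'wb') as f:
                f.write(resp.content)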


def main():
    for i in range(1, 2):
        i = str(i * 20)
        html = get_onepage_index(i, '街拍')
        for url in parse_onepage_index(html):
            if not url:  # some result items carry no article_url
                continue
            print(url)
            detailhtml = get_page_detail(url)  # returns the page HTML, or None
            if detailhtml is not None:
                parsepage(detailhtml)  # parse it with bs4


# get_page_detail('http://toutiao.com/group/6596305324645286404/')

if __name__ == '__main__':
    main()


If you have any questions, please leave a comment.

If you found this helpful, please give it a like. Thanks!

