Web Scraping Practice


Small crawler projects

0. Scraping university rankings

import bs4
import requests
from bs4 import BeautifulSoup


# Fetch the HTML text of a page given its URL
def getHTMLText(url):
    # Guard against any errors while fetching the page
    try:
        # Request the URL with a 30-second timeout
        r = requests.get(url, timeout=30)
        # Raise an exception for non-200 status codes
        r.raise_for_status()
        # Use the apparent encoding so Chinese text decodes correctly
        r.encoding = r.apparent_encoding
        # On success, return the page text
        return r.text
    except:
        # On any error, return an empty string
        return ""


# Fill the university list: takes an empty list and the page HTML
def fillUnivList(ulist, html):
    # Parse the page text with BeautifulSoup using the 'html.parser' parser
    soup = BeautifulSoup(html, "html.parser")
    # The data we need lives inside the tbody tag, so iterate over the children of 'tbody'
    for tr in soup.find('tbody').children:
        # Keep only real tr tags (skip NavigableString children) via bs4.element.Tag
        if isinstance(tr, bs4.element.Tag):
            # For each tr tag, grab its td tags
            tds = tr('td')
            # e.g. [<td>1</td>, <td><div align="left">清華大學</div></td>, <td>北京</td>, <td>95.3</td>...
            # Pick out the text of the td tags we need and append it to ulist as a list
            ulist.append([tds[0].string, tds[1].string,
                          tds[2].string, tds[3].string])


# Print the university list, filtered by province
def printUnivList(ulist, province):
    # Print the title (format in the province first, then center the string)
    print("中國最好大學排名2018({}地區)".format(province).center(45, '-'))
    # Output template for str.format
    # Note: {4} is the fill character. Chinese and ASCII characters have different display
    # widths, and Python pads with ASCII spaces by default, so we pass chr(12288)
    # (a full-width space) to pad the school-name column with Chinese-width spaces.
    tplt = "{0:^10}\t{1:{4}^10}\t{2:^10}\t{3:^10}"
    # Print the header row
    print(tplt.format("排名", "學校名稱", "地區", "總分", chr(12288)))
    if province == '安徽':
        print(tplt.format(1, '安徽師范大學花津校區', '安徽', 99.9, chr(12288)))
    # Loop over every university in the list; each entry is itself a list
    # (shorten range(len(ulist)) to limit how many schools are printed)
    for i in range(len(ulist)):
        # Assign the current university's info (a list) to u
        u = ulist[i]
        # u[2] is the region: only print schools in the requested province
        # (change or drop this check to print other regions or every school)
        if u[2] == province:
            # Print this university's row
            print(tplt.format(u[0], u[1], u[2], u[3], chr(12288)))


# Entry point
def main(province='安徽'):
    # Empty list that fillUnivList will populate
    uinfo = []
    # Page to scrape
    url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2018.html'
    # Fetch the page text
    html = getHTMLText(url)
    # Fill the university list
    fillUnivList(uinfo, html)
    # Print the university list
    printUnivList(uinfo, province=province)


main()
# main(province='北京')
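
The chr(12288) trick in the template above is easy to miss, so here is a tiny standalone demonstration (the strings are purely illustrative). chr(12288) is the full-width ideographic space U+3000, and passing it as the fill character keeps columns of Chinese text aligned where an ASCII space would not:

# Minimal sketch: padding a Chinese column with a full-width space
tplt = "{0:^10}\t{1:{2}^10}"
print(tplt.format("排名", "學校名稱", ' '))           # name column padded with ASCII spaces
print(tplt.format("排名", "學校名稱", chr(12288)))    # name column padded with U+3000, aligns with Chinese text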

1. Scraping the Douban Top 250

import requests
import time
from openpyxl import Workbook
from bs4 import BeautifulSoup

wb = Workbook()
sheet = wb.active
# Set up the worksheet title and the header row once, before scraping
sheet.title = '好評電影'
sheet['A1'].value = '序號'
sheet['B1'].value = '電影名稱'
sheet['C1'].value = '電影評分'
sheet['D1'].value = '電影鏈接'
sheet['E1'].value = '電影圖片'

count = 1
# Each page shows 25 movies; range(0, 100, 25) fetches the first four pages (100 movies)
for i in range(0, 100, 25):
    ret = requests.get('https://movie.douban.com/top250?start=%s&filter=' % (i))
    bs = BeautifulSoup(ret.text, 'html.parser')
    # The movie list is an <ol class="grid_view"> containing one <li> per movie
    ol = bs.find(name='ol', attrs={'class': 'grid_view'})
    li_list = ol.find_all(name='li')
    for li in li_list:
        name = li.find(name='span', attrs={'class': 'title'})       # movie title
        a = li.find(name='a')                                        # detail-page link
        span = li.find(name='span', attrs={'class': 'rating_num'})   # rating
        img = a.find(name='img')                                     # poster image
        count += 1
        # Write one row per movie (row 1 is the header row)
        sheet['A%s' % (count)].value = count - 1
        sheet['B%s' % (count)].value = name.text
        sheet['C%s' % (count)].value = span.text
        sheet['D%s' % (count)].value = a['href']
        sheet['E%s' % (count)].value = img['src']
    # Pause briefly between pages to avoid hammering the site
    time.sleep(1)
wb.save('好評電影.xlsx')
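
One caveat: Douban may reject requests that carry the default requests User-Agent, in which case ret.text will not contain the movie list. If that happens, passing a browser-like header (the same one used in the Autohome example below) is a minimal fix:

# Sketch: send a browser-like User-Agent with each Douban page request inside the loop
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
ret = requests.get('https://movie.douban.com/top250?start=%s&filter=' % (i), headers=headers)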

2. Scraping Autohome

import requests
from bs4 import BeautifulSoup
from openpyxl import Workbook


def run(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
    }
    response = requests.get(url,headers=headers)
    response.encoding = 'gbk'
    soup = BeautifulSoup(response.text,'html.parser')
    # Get the <ul class="article"> that holds the news list
    ul = soup.find(name='ul', attrs={"class": "article"})
    # Get all of its <li> items
    li_list = ul.find_all(name='li')
    infos = []
    for li in li_list:
        # Fall back to empty strings whenever a field is missing
        # (not every <li> carries all three elements)
        name = li.find(name="h3")
        name1 = ""
        if name:
            name1 = name.text
        href = li.find(name='a')
        href1 = ""
        if href:
            href1 = 'http:' + href['href']
        info = li.find(name='p')
        info1 = ""
        if info:
            info1 = info.text
        infos.append({"title": name1, "href": href1, "info": info1})
    print(infos)

if __name__ == '__main__':
    url = 'https://www.autohome.com.cn/news/'
    run(url)

3. Scraping Doutula emoticon packs

import requests
from bs4 import BeautifulSoup

ret = requests.get('https://www.doutula.com/photo/list?page=0')
bs = BeautifulSoup(ret.text, 'html.parser')
# The images live inside <div class="page-content text-center">
div = bs.find(name='div', attrs={'class': 'page-content text-center'})

a_list = div.find_all(name='a')
for a in a_list:
    img = a.find(name='img')
    img_name = img.get('alt')           # image title, used as the file name
    img_url = img.get('data-backup')    # image URL stored in the data-backup attribute

    if img_name and img_url:
        # print(img_name)
        # print(img_url)
        # Note: the '表情包' directory must already exist
        ret_img = requests.get(img_url)
        with open('表情包/%s.jpg' % (img_name), 'wb') as f:
            f.write(ret_img.content)
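
The loop above saves files into a 表情包/ folder and will fail with a FileNotFoundError if that folder does not exist; a small guard using only the standard library can create it first:

import os

# Create the output directory if it is not there yet
os.makedirs('表情包', exist_ok=True)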

4. Scraping Pear Video

import requests
import re
from bs4 import BeautifulSoup

ret = requests.get('https://www.pearvideo.com/')
# print(ret.text)  # uncomment to inspect the raw homepage HTML

bs = BeautifulSoup(ret.text, 'html.parser')
# Each video card on the homepage sits in a <div class="vervideo-tbd">
div_list = bs.find_all(name='div', attrs={'class': 'vervideo-tbd'})

num = 0
for div in div_list:
    a = div.find(name='a')
    video_url = 'https://www.pearvideo.com/' + a.get('href')
    video_ret = requests.get(video_url)

    # Look for an mp4 link embedded in the detail page; skip the video if none is found
    match = re.search(r'https://[^\s]+mp4', video_ret.text)
    if not match:
        continue
    mp4_url = match.group()
    print(mp4_url)
    mp4_ret = requests.get(mp4_url)
    with open('梨視頻%s.mp4' % (num), 'wb') as f:
        f.write(mp4_ret.content)
    num += 1

Implementing online translation

import requests
import json
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}


def main(keys=''):
    url = 'http://fy.iciba.com/ajax.php?a=fy'
    data = {
        'f': 'auto',
        't': 'auto',
        'w': keys
    }
    response = requests.post(url,headers=headers,data=data)
    info = response.text
    data_list = json.loads(info)
    try:
        val = data_list['content']['word_mean']  # Chinese -> English
    except:
        val = data_list['content']['out']  # English -> Chinese
    return val

if __name__ == '__main__':
    keys = input('請輸入需要翻譯的英文或者中文...')
    if not keys:
        print('請您正確輸入需要翻譯的中文或者英文...')
    else:
        data = main(keys)
        print(data)
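
As a small side note, requests can decode the JSON body itself, so the json.loads(response.text) step above could also be written as in this sketch (same endpoint and payload assumed):

# Equivalent decoding using requests' built-in JSON parser
response = requests.post(url, headers=headers, data=data)
data_list = response.json()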

Small selenium projects

Warm-up

# Search Baidu for 老男孩
from selenium import webdriver
# Launch the browser
b = webdriver.Chrome()
# Open Baidu
b.get('https://www.baidu.com')
# Locate Baidu's search input box by its id: kw
ele = b.find_element_by_id('kw')
# Clear any existing text in the input box
ele.clear()
# Type 老男孩
ele.send_keys('老男孩')
# Locate the search button (id: su)
su = b.find_element_by_id('su')
# Click the button
su.click()
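
Note that the find_element_by_id helpers used here were removed in Selenium 4; on a newer Selenium the same warm-up looks roughly like the sketch below, using the By locator:

# Selenium 4 style locators (equivalent to find_element_by_id above)
from selenium import webdriver
from selenium.webdriver.common.by import By

b = webdriver.Chrome()
b.get('https://www.baidu.com')
ele = b.find_element(By.ID, 'kw')
ele.clear()
ele.send_keys('老男孩')
b.find_element(By.ID, 'su').click()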

Scraping JD.com

from selenium import webdriver
from selenium.webdriver.common.keys import Keys  # keyboard key constants
import time

def get_goods(driver):
    try:
        goods = driver.find_elements_by_class_name('gl-item')

        for good in goods:
            detail_url = good.find_element_by_tag_name('a').get_attribute('href')

            p_name = good.find_element_by_css_selector('.p-name em').text.replace('\n','')
            price = good.find_element_by_css_selector('.p-price i').text
            p_commit = good.find_element_by_css_selector('.p-commit a').text
            msg = '''
            商品 : %s
            鏈接 : %s
            價錢 :%s
            評論 :%s
            ''' % (p_name, detail_url, price, p_commit)

            print(msg, end='\n\n')

        button = driver.find_element_by_partial_link_text('下一頁')
        button.click()
        time.sleep(1)
        get_goods(driver)
    except Exception:
        # Stop when there is no "next page" link or an element lookup fails
        pass


def spider(url, keyword):
    driver = webdriver.Chrome()
    driver.get(url)
    driver.implicitly_wait(3)  # implicit wait: poll up to 3 seconds for elements to appear
    try:
        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys(keyword)
        input_tag.send_keys(Keys.ENTER)
        get_goods(driver)
    finally:
        driver.close()


if __name__ == '__main__':
    spider('https://www.jd.com/', keyword='華為P30')
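
If the fixed time.sleep(1) between pages turns out to be flaky, an explicit wait is the usual alternative; a minimal sketch (Selenium-style explicit wait, assuming the same gl-item class and a hypothetical helper name):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def wait_for_goods(driver, timeout=10):
    # Block until at least one product card is present before scraping the page
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CLASS_NAME, 'gl-item'))
    )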

Crawling and data analysis: 雨女無瓜 (a bilibili danmaku word cloud)

import requests
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import matplotlib.pyplot as plt
import re
import jieba
import numpy as np
from imageio import imread  # scipy.misc.imread was removed from recent SciPy; imageio offers a drop-in imread
from wordcloud import WordCloud, ImageColorGenerator

url = "https://comment.bilibili.com/92542241.xml"
r = requests.get(url)
r.encoding = 'utf8'


soup = BeautifulSoup(r.text,'lxml')
d = soup.find_all('d')

# Collect every danmaku into a list of dicts
dlst = []
n = 0
for i in d:
    n += 1
    danmuku = {}
    danmuku['彈幕'] = i.text
    danmuku['網址'] = url
    danmuku['時間'] = datetime.date.today()
    dlst.append(danmuku)

df = pd.DataFrame(dlst)

# Keep only the Chinese characters of each danmaku and write them to sign.txt
with open('sign.txt', 'w', encoding='utf8') as f:
    for text in df['彈幕'].values:
        pattern = re.compile(r'[一-龥]+')  # matches runs of CJK characters (U+4E00..U+9FA5)
        filter_data = re.findall(pattern, text)
        f.write("".join(filter_data))

# Tokenise the cleaned text with jieba
with open('sign.txt', 'r', encoding='utf8') as f:
    data = f.read()
    segment = jieba.lcut(data)
    words_df = pd.DataFrame({"segment": segment})

# Count how often each token appears, then sort by count in descending order
words_stat = words_df.groupby('segment').size().reset_index(name='計數')
words_stat = words_stat.sort_values(by=['計數'], ascending=False)

# Background image used as the word-cloud mask (01.jpg must exist in the working directory)
color_mask = imread('01.jpg')

wordcloud = WordCloud(
    # font_path="simhei.ttf",   # this font is not available on macOS
    font_path=r"C:\Windows\Fonts\simkai.ttf",  # a font that can render Chinese characters
    background_color="white",   # background colour
    max_words=3000,             # maximum number of words shown in the cloud
    mask=color_mask,            # background image used as the mask
    max_font_size=200,          # largest font size
    random_state=100,
    width=1000, height=860, margin=2,
    # Default image size; when a mask image is used, the saved image follows the mask's
    # size instead. margin is the spacing kept around each word.
)

# Generate the cloud: generate() accepts raw text, while generate_from_frequencies()
# takes a pre-computed {word: frequency} mapping, which is what we use here
word_frequence = {x[0]: x[1] for x in words_stat.head(500).values}
wordcloud.generate_from_frequencies(word_frequence)
# Derive colours from the mask image
# image_colors = ImageColorGenerator(color_mask)
# Recolour the cloud with those colours
# wordcloud.recolor(color_func=image_colors)
# Save the image
wordcloud.to_file('output.png')
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
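
As a side note, the groupby counting above can be done more compactly with value_counts; a minimal sketch assuming the same words_df and wordcloud objects:

# Equivalent word-frequency counting: top 500 tokens as a {word: count} dict
word_frequence = words_df['segment'].value_counts().head(500).to_dict()
wordcloud.generate_from_frequencies(word_frequence)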

 

