簡單python爬蟲實例


先放上url,https://music.douban.com/chart

這是豆瓣的一個音樂排行榜,這裡爬取了頁面左邊的歌曲排行榜部分。爬蟲很簡單,只用到了beautifulsoup和requests這兩個庫,爬取後分別把內容存儲到txt、csv和數據庫。

0x01:存儲到txt

import requests
from bs4 import BeautifulSoup
url = 'https://music.douban.com/chart'
# Bound the wait and stop on an HTTP error status, so that an error page is
# never parsed as if it were the chart.
response = requests.get(url=url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')
ul = soup.find(attrs={'class': 'col5'})     # element with class "col5" that holds the chart entries
lis = ul.find_all(name='li')                # one <li> per song
# Open the output file once for the whole run instead of once per song; the
# with-block closes it, so no explicit close() is needed.
with open('dou_bai_music.txt', 'a', encoding='utf-8') as f:
    for li in lis:
        paiming = li.find(name='span').string       # text of the first <span> (the rank)
        name = li.find(name='a', attrs={'href': 'javascript:;'}).string     # song title
        a = li.find(name='a', attrs={'class': 'face'})      # <a class="face"> wrapping the image, if any
        # Per the original author's note, only the first ten songs have this
        # link; the remaining entries get a placeholder instead.
        if a is not None:
            img = a.find(name='img')
            lianjie = img.attrs['src']
        else:
            lianjie = '沒有鏈接'
        print(paiming, name, lianjie)
        # One tab-separated line per song: rank, title, link.
        f.write(paiming+'\t'+name+'\t'+lianjie+'\n')

0x02:存儲到csv

import requests
from bs4 import BeautifulSoup
import csv
url = 'https://music.douban.com/chart'
# Bound the wait and stop on an HTTP error status, so that an error page is
# never parsed as if it were the chart.
response = requests.get(url=url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')
ul = soup.find(attrs={'class': 'col5'})     # element with class "col5" that holds the chart entries
lis = ul.find_all(name='li')                # one <li> per song
# Open the CSV once for the whole run instead of reopening it for every row.
with open('doubai.csv', 'a', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    # The file is opened in append mode, so the position starts at the end of
    # any existing content. Write the header only when the file is empty;
    # otherwise every re-run would insert another header row into the data.
    if f.tell() == 0:
        write.writerow(['排名', '歌名', '鏈接'])
    for li in lis:
        paiming = li.find(name='span').string       # rank column
        name = li.find(name='a', attrs={'href': 'javascript:;'}).string     # song title column
        a = li.find(name='a', attrs={'class': 'face'})      # <a class="face"> wrapping the image, if any
        if a is not None:
            img = a.find(name='img')
            lianjie = img.attrs['src']
        else:
            # Entries without the image link get a placeholder value.
            lianjie = '沒有鏈接'
        print(paiming, name, lianjie)
        write.writerow([paiming, name, lianjie])

0x03:存儲到數據庫

import requests
from bs4 import BeautifulSoup
import pymysql
url = 'https://music.douban.com/chart'
# Bound the wait and stop on an HTTP error status, so that an error page is
# never parsed as if it were the chart.
response = requests.get(url=url, timeout=10)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'lxml')
ul = soup.find(attrs={'class': 'col5'})     # element with class "col5" that holds the chart entries
lis = ul.find_all(name='li')                # one <li> per song
# NOTE(review): credentials are hard-coded for a local demo database; load
# them from configuration before using this anywhere else.
db = pymysql.connect(host='localhost', user='root', password='root', port=3306, db='spiders')  # connect to the database
try:
    cursor = db.cursor()
    print('數據庫連接成功')
    # Create the target table on first run; paiming (rank) is the primary key.
    sql = 'CREATE TABLE IF NOT EXISTS music (paiming INT NOT NULL, name VARCHAR(255) NOT NULL, lianjie VARCHAR(255) NOT NULL, PRIMARY KEY (paiming))'
    cursor.execute(sql)
    print('數據表創建完成!')
    # Parameterized INSERT; it does not change between rows, so build it once.
    sql = 'INSERT INTO music(paiming, name, lianjie) values(%s, %s, %s)'
    for li in lis:
        paiming = li.find(name='span').string       # rank
        name = li.find(name='a', attrs={'href': 'javascript:;'}).string     # song title
        a = li.find(name='a', attrs={'class': 'face'})      # <a class="face"> wrapping the image, if any
        if a is not None:
            img = a.find(name='img')
            lianjie = img.attrs['src']
        else:
            # Entries without the image link get a placeholder value.
            lianjie = '沒有鏈接'
        print(paiming, name, lianjie)
        try:
            cursor.execute(sql, (paiming, name, lianjie))
            db.commit()
            print('數據插入完成!!')
        except pymysql.MySQLError as exc:
            # Catch database errors only (e.g. a duplicate primary key on a
            # re-run) and show the reason; anything else should propagate.
            print('插入失敗', exc)
            db.rollback()
finally:
    # Always release the connection, even if scraping or SQL raised.
    db.close()

 

*******************************不積跬步,無以至千里。******************************


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM