Scraping data from the Douban Music Top 250


Reference URL: https://music.douban.com/top250

Because the detail pages carry richer information, this crawler works on the detail pages: it first collects the link to each detail page from the list pages, and then scrapes the data from those pages.

The fields to scrape are: song title, performer, genre, release date, publisher, and rating.

The data is saved to TXT, JSON, and CSV files.
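
Each scraped record is a flat dict whose keys match the yield statement in parse_one_page below. The values here are only placeholders to illustrate the shape of one entry:

record = {
    'performer': '...',   # performer name, taken from the #info block
    'song': '...',        # song title, taken from the page <h1>
    'style': '...',       # genre; 'NULL' when the page has no genre line
    'time': '...',        # release date
    'publisher': '...',   # publisher; 'NULL' when missing
    'score': '...',       # Douban rating
}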

import re
import csv
import time
import json
import requests
from bs4 import BeautifulSoup
from requests import RequestException
    
    
def get_one_page(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36'
                   + ' (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'}
        response = requests.get(url, headers=headers)
        #response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

def get_detailurl(text):
    detailurl = []
    soup = BeautifulSoup(text, 'lxml')
    nbg = soup.find_all(name='a', class_='nbg')
    for i in nbg:
        detailurl.append(i['href'])
    return detailurl
    
def parse_one_page(text):
    soup = BeautifulSoup(text, 'lxml') # parse the page with the lxml parser
    performer = soup.select('#info > span > span > a')
    # select() works when the tag has no distinguishing attribute; otherwise find_all() is used
    song = soup.select('#wrapper > h1 > span')
    
    style = re.findall('流派:</span> (.*?)<br', text, re.S) # .*? is a non-greedy match
    # re.S makes . match any character, including newlines
    if len(style) == 0: # some pages have no genre; fill with NULL
        style.append('NULL')
    publisher = re.findall('出版者:</span> (.*?)<br', text, re.S)
    if len(publisher) == 0: # some pages have no publisher; fill with NULL
        publisher.append('NULL')
    pattern = re.compile('发行时间:</span> (.*?)<br', re.S) # the page label is in simplified Chinese
    # compile() turns the regex string into a pattern object so it can be reused
    release_time = re.findall(pattern, text) # named release_time to avoid shadowing the time module
    
    score = soup.find_all(name='strong', class_="ll rating_num")
    # find_all() is used when the tag has a distinguishing attribute
    yield {
        'performer': performer[0].string,
        'song': song[0].string,
        'style': style[0].strip(),
        'time': release_time[0].strip(),
        'publisher': publisher[0].strip(),
        'score': score[0].string,
    }
        
def write_to_file(content):
    with open('doubanMusicTop250.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False)+'\n')
        # json.dumps() serializes the dict to a JSON string

def write_to_json(content):
    with open('doubanMusicTop250.json', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False)+'\n')
        
def write_to_csv(content):
    with open('doubanMusicTop250.csv', 'a', encoding='utf-8', newline='') as f:  # newline='' avoids blank rows on Windows
        fieldnames = ['publisher', 'style', 'song', 'score', 'performer', 'time']
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(content)

if __name__ == '__main__':
    url = 'https://music.douban.com/top250?start={}'
    urls = [url.format(page) for page in range(0, 250, 25)]  # 10 list pages, 25 entries each
    content = []
    for url in urls:
        text1 = get_one_page(url)
        if not text1:  # skip this list page if the request failed
            continue
        detailurl = get_detailurl(text1)
        for i in detailurl:
            text2 = get_one_page(i)
            if not text2:  # skip this detail page if the request failed
                continue
            for item in parse_one_page(text2):          
                print(item)
                write_to_file(item)            
                content.append(item)
        time.sleep(1)  # pause briefly between list pages to avoid hammering the server
    write_to_csv(content)
    write_to_json(content)
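
Both the .txt and the .json output files are written as JSON Lines (one JSON object per line), so they can be read back line by line for a quick check. A minimal sketch, assuming the script above has already produced doubanMusicTop250.json in the working directory:

import json

# doubanMusicTop250.json holds one JSON object per line (JSON Lines),
# so it is read line by line instead of with a single json.load()
with open('doubanMusicTop250.json', encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]

print(len(records))                               # roughly 250 if every page parsed cleanly
print(records[0]['song'], records[0]['score'])    # title and rating of the first record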

