先放上url,https://music.douban.com/chart
這是豆瓣的一個音樂排行榜,這里爬取了頁面左邊的歌曲排行榜部分。爬蟲很簡單,只用到了beautifulsoup和requests這兩個庫,爬取后分別把內容存儲到txt、csv和數據庫。
0x01:存儲到txt
import requests
from bs4 import BeautifulSoup

# Scrape the song chart on the Douban Music chart page and append one
# tab-separated line per song (rank, title, cover-image link) to a text file.

url = 'https://music.douban.com/chart'

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into a clear failure instead of
# an AttributeError further down when the expected markup is missing.
response = requests.get(url=url, timeout=10)
response.raise_for_status()
html = response.text

soup = BeautifulSoup(html, 'lxml')
ul = soup.find(attrs={'class': 'col5'})  # element that holds the chart entries
if ul is None:
    raise RuntimeError("chart container (class 'col5') not found in the page")
lis = ul.find_all(name='li')  # one <li> per song

# Open the output file once for the whole run instead of once per song; the
# `with` block closes it, so no explicit close() is needed.
with open('dou_bai_music.txt', 'a', encoding='utf-8') as f:
    for li in lis:
        rank_tag = li.find(name='span')
        title_tag = li.find(name='a', attrs={'href': 'javascript:;'})
        if rank_tag is None or title_tag is None:
            # Not a song entry in the expected shape; skip it rather than crash.
            continue

        # get_text() always returns a str, whereas .string is None when the
        # tag has more than one child, which would break the concatenation.
        paiming = rank_tag.get_text(strip=True)
        name = title_tag.get_text(strip=True)

        # The link is the `src` of the <img> inside the <a class="face"> tag.
        # Per the original author's note, only the first ten songs have one.
        a = li.find(name='a', attrs={'class': 'face'})
        img = a.find(name='img') if a is not None else None
        src = img.get('src') if img is not None else None
        lianjie = src if src else '沒有鏈接'

        print(paiming, name, lianjie)
        f.write(paiming + '\t' + name + '\t' + lianjie + '\n')
0x02:存儲到csv
import csv
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Scrape the song chart on the Douban Music chart page and append one CSV row
# per song (rank, title, cover-image link) to doubai.csv.

url = 'https://music.douban.com/chart'

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into a clear failure instead of
# an AttributeError further down when the expected markup is missing.
response = requests.get(url=url, timeout=10)
response.raise_for_status()
html = response.text

soup = BeautifulSoup(html, 'lxml')
ul = soup.find(attrs={'class': 'col5'})  # element that holds the chart entries
if ul is None:
    raise RuntimeError("chart container (class 'col5') not found in the page")
lis = ul.find_all(name='li')  # one <li> per song

csv_path = Path('doubai.csv')
# The file is opened in append mode, so only write the header when the file is
# new or empty; otherwise every rerun would add another header row mid-file.
need_header = not csv_path.exists() or csv_path.stat().st_size == 0

# Open the file once for the header and all rows instead of once per song.
with open(csv_path, 'a', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    if need_header:
        writer.writerow(['排名', '歌名', '鏈接'])

    for li in lis:
        rank_tag = li.find(name='span')
        title_tag = li.find(name='a', attrs={'href': 'javascript:;'})
        if rank_tag is None or title_tag is None:
            # Not a song entry in the expected shape; skip it rather than crash.
            continue

        # get_text() always returns a str, whereas .string is None when the
        # tag has more than one child.
        paiming = rank_tag.get_text(strip=True)
        name = title_tag.get_text(strip=True)

        # The link is the `src` of the <img> inside the <a class="face"> tag;
        # entries without one get a placeholder.
        a = li.find(name='a', attrs={'class': 'face'})
        img = a.find(name='img') if a is not None else None
        src = img.get('src') if img is not None else None
        lianjie = src if src else '沒有鏈接'

        print(paiming, name, lianjie)
        writer.writerow([paiming, name, lianjie])
0x03:存儲到數據庫
import requests
from bs4 import BeautifulSoup
import pymysql

# Scrape the song chart on the Douban Music chart page and insert one row per
# song (rank, title, cover-image link) into the MySQL table `music`.

url = 'https://music.douban.com/chart'

# A timeout keeps the script from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into a clear failure instead of
# an AttributeError further down when the expected markup is missing.
response = requests.get(url=url, timeout=10)
response.raise_for_status()
html = response.text

soup = BeautifulSoup(html, 'lxml')
ul = soup.find(attrs={'class': 'col5'})  # element that holds the chart entries
if ul is None:
    raise RuntimeError("chart container (class 'col5') not found in the page")
lis = ul.find_all(name='li')  # one <li> per song

# Connect to the database.
# NOTE(review): the credentials are hard-coded as in the original script;
# move them to configuration before using this outside a local test setup.
db = pymysql.connect(host='localhost', user='root', password='root', port=3306, db='spiders')
try:
    cursor = db.cursor()
    print('數據庫連接成功')

    # Create the table if needed; the rank is the primary key, so inserting a
    # rank that already exists fails and is reported by the handler below.
    sql = 'CREATE TABLE IF NOT EXISTS music (paiming INT NOT NULL, name VARCHAR(255) NOT NULL, lianjie VARCHAR(255) NOT NULL, PRIMARY KEY (paiming))'
    cursor.execute(sql)
    print('數據表創建完成!')

    # Parameterized statement, built once rather than on every iteration.
    insert_sql = 'INSERT INTO music(paiming, name, lianjie) values(%s, %s, %s)'

    for li in lis:
        rank_tag = li.find(name='span')
        title_tag = li.find(name='a', attrs={'href': 'javascript:;'})
        if rank_tag is None or title_tag is None:
            # Not a song entry in the expected shape; skip it rather than crash.
            continue

        # get_text() always returns a str, whereas .string is None when the
        # tag has more than one child (which would violate NOT NULL).
        paiming = rank_tag.get_text(strip=True)
        name = title_tag.get_text(strip=True)

        # The link is the `src` of the <img> inside the <a class="face"> tag;
        # entries without one get a placeholder.
        a = li.find(name='a', attrs={'class': 'face'})
        img = a.find(name='img') if a is not None else None
        src = img.get('src') if img is not None else None
        lianjie = src if src else '沒有鏈接'

        print(paiming, name, lianjie)

        # Insert the row and report whether it succeeded. Only database errors
        # are caught, so Ctrl-C and programming errors are no longer hidden,
        # and the error is printed so the failure reason is visible.
        try:
            cursor.execute(insert_sql, (paiming, name, lianjie))
            db.commit()
            print('數據插入完成!!')
        except pymysql.MySQLError as e:
            print('插入失敗', e)
            db.rollback()
finally:
    # Close the connection even if something above raised.
    db.close()
*******************************不積跬步,無以至千里。******************************