1.首先,下載需要的第三方模塊requests, BeautifulSoup(安裝包名為beautifulsoup4), pymysql;datetime是Python標準庫,無需另外下載(注意,因為我用的是python3.7,已經不支持mysqldb了,所以改用pymysql)。具體的下載方法有pip下載,使用Anaconda版本python的童鞋也可以使用conda下載。
2.創建conndb.py,包含數據庫的連接、斷開,以及增刪改查等操作:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Small helpers around a pymysql connection: connect, execute, commit, close."""
import pymysql


def conn_db():
    """Open a database connection and create a cursor on it.

    The host/user/password/database values below are placeholders that must
    be replaced with real settings before use.

    Returns:
        A ``(conn, cur)`` tuple: the pymysql connection and a cursor on it.
    """
    conn = pymysql.connect(
        host='localhost',
        user='數據庫用戶名',
        passwd='數據庫密碼',
        db='數據庫名稱',
        charset='utf8')
    cur = conn.cursor()
    return conn, cur


def exe_update(cur, sql, params=None):
    """Execute a write statement (e.g. UPDATE or INSERT).

    Args:
        cur: Cursor to execute on.
        sql: SQL text. Use ``%s`` placeholders together with ``params``
            instead of formatting values into the string.
        params: Optional sequence/mapping of values bound to the
            placeholders. ``None`` (the default) executes ``sql`` as-is.

    Returns:
        Whatever ``cur.execute`` returns (the affected-row count in pymysql).
    """
    return cur.execute(sql, params)


def exe_delete(cur, ids):
    """Delete rows of table ``cms`` by id; supports deleting several at once.

    Args:
        cur: Cursor to execute on.
        ids: Whitespace-separated ids, e.g. ``"3 7 12"``.

    Returns:
        The result of the last DELETE executed, or 0 if ``ids`` holds no id.

    Raises:
        ValueError: If any token is not an integer. All tokens are validated
            before the first DELETE runs, so a bad token deletes nothing.
    """
    # split() without an argument ignores leading/trailing/repeated
    # whitespace, which split(' ') would turn into empty tokens.
    id_values = [int(token) for token in ids.split()]
    sta = 0
    for each_id in id_values:
        sta = cur.execute('delete from cms where id = %s', (each_id,))
    return sta


def exe_query(cur, sql, params=None):
    """Execute a query and return the cursor so the caller can fetch rows.

    Args:
        cur: Cursor to execute on.
        sql: SQL text, optionally with ``%s`` placeholders.
        params: Optional values bound to the placeholders.

    Returns:
        The same cursor that was passed in, after the query has run.
    """
    cur.execute(sql, params)
    return cur


def exe_commit(cur):
    """Commit on the cursor's connection; without it writes are not persisted."""
    cur.connection.commit()


def conn_close(conn, cur):
    """Close the cursor and then the connection.

    The connection is closed even if closing the cursor raises.
    """
    try:
        cur.close()
    finally:
        conn.close()
3.創建另一個python文件,用於抓取數據(注意引入conndb.py文件):
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""Scrape today's news from a list page and store each article in table ``cms``."""
import datetime

import requests
from bs4 import BeautifulSoup

import conndb


def get_html_text(url):
    """Download ``url`` and return its body decoded as UTF-8.

    Returns:
        The response text, or ``""`` if the request fails.
    """
    try:
        r = requests.get(url, timeout=30)
        r.encoding = 'utf-8'  # force the decoding used for r.text
        return r.text
    except requests.RequestException as e:
        # Only request failures are swallowed; KeyboardInterrupt, SystemExit
        # and programming errors are allowed to propagate.
        print('RequestException:', e)
        return ""


def get_content(url):
    """Walk the list page at ``url`` and store every item dated today.

    Each ``.list-point > .item`` element is expected to contain a ``<span>``
    whose text starts with a two-digit-year date followed by a space
    (the code prepends ``'20'`` and compares with today's ``%Y-%m-%d``),
    and an ``<a>`` holding the article link and title. Items that lack any
    of these parts are skipped.
    """
    html = get_html_text(url)
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.select(".list-point > .item")  # CSS selector by class name
    today = datetime.datetime.now().strftime('%Y-%m-%d')
    for item in items:
        span = item.find('span')
        link = item.find('a')
        if span is None or link is None or not span.string:
            continue
        created = span.string.strip()
        date_part, sep, _ = created.partition(' ')
        if not sep:
            continue  # no "date time" separator: not the expected format
        if '20' + date_part != today:
            continue  # only items created today are scraped
        href = link.get('href')
        if not href:
            continue
        get_new_content(href, link.get_text(strip=True), created)


def get_new_content(url, title, tim1):
    """Fetch one article and insert it into table ``cms``.

    Uses the module-level cursor ``cur`` created when the script is run.

    Args:
        url: Address of the article page.
        title: Article title.
        tim1: Creation time as shown on the list page, two-digit year first
            (parsed as ``'20' + tim1`` with format ``%Y-%m-%d %H:%M``).

    Raises:
        ValueError: If ``tim1`` does not match the expected format.
    """
    html = get_html_text(url)
    soup = BeautifulSoup(html, 'html.parser')
    paragraphs = soup.select(".article-content > p")
    # Serialise each <p> tag and concatenate them. Slicing str(list) to drop
    # the brackets left the trailing ']' and the ", " separators in the text.
    p_str = ''.join(str(tag) for tag in paragraphs)
    # Convert the string date into the datetime the database column expects.
    tim2 = datetime.datetime.strptime('20' + tim1, '%Y-%m-%d %H:%M')
    # Values are passed as bound parameters so quotes inside the scraped
    # title/HTML can neither break the statement nor inject SQL.
    sta = cur.execute(
        "insert into cms(title, content, gmt_create) values(%s, %s, %s)",
        (title, p_str, tim2))
    if sta == 1:
        print('插入成功')
    else:
        print('插入失敗')


def main():
    """Scrape the configured list page (replace the placeholder URL first)."""
    url = "抓取的頁面url"
    get_content(url)


if __name__ == "__main__":
    # Open the database connection used by get_new_content via ``cur``.
    conn, cur = conndb.conn_db()
    try:
        main()
        # Commit is required, otherwise the inserts succeed but are not saved.
        conndb.exe_commit(cur)
    finally:
        # Release the cursor and connection even if scraping failed.
        conndb.conn_close(conn, cur)
這樣,抓取到的數據就可以保存到數據庫中了。