# coding: utf-8
"""Breadth-first crawler that saves every image found on the pages it visits.

Starting from a seed URL, each page is fetched, its ``<img>`` sources are
downloaded and its absolute ``<a href>`` links are appended to the crawl
queue. Progress is printed to standard output.
"""
import http.client
import io
import json
import os
import re
import sys
import time
import urllib
import urllib.parse
import urllib.request
from collections import deque

from bs4 import BeautifulSoup  # BeautifulSoup makes parsing HTML far less work

# Change the default encoding of standard output (this matters: it avoids
# encoding errors when the non-ASCII progress messages below are printed).
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

# Errors that mean "this one resource could not be fetched": network and file
# failures (OSError), malformed URLs or undecodable bytes (ValueError) and
# protocol violations (HTTPException).
_FETCH_ERRORS = (OSError, ValueError, http.client.HTTPException)


def gethtml(soup, dest_dir="k:/asd/", base_url=None):
    """Download every image referenced by an ``<img>`` tag in *soup*.

    Each image is stored as ``<timestamp>_<index>.jpg`` inside *dest_dir*
    (the ``.jpg`` suffix is applied regardless of the real image type).
    Downloading is best effort: a tag without ``src`` is skipped, and a failed
    download is reported on stderr without interrupting the remaining ones.

    Args:
        soup: Parsed document; only ``soup.find_all("img")`` and
            ``.get("src")`` on each result are used.
        dest_dir: Directory that receives the files; created when missing.
        base_url: URL of the page *soup* came from. When given, relative
            ``src`` values are resolved against it; otherwise they are used
            verbatim.

    Returns:
        None.
    """
    images = soup.find_all("img")
    if not images:
        return

    try:
        os.makedirs(dest_dir, exist_ok=True)
    except OSError as err:
        print('cannot create %s: %s' % (dest_dir, err), file=sys.stderr)
        return

    for index, tag in enumerate(images):
        fileurl = tag.get("src")
        if not fileurl:
            continue  # <img> without a usable src: nothing to download
        if base_url:
            fileurl = urllib.parse.urljoin(base_url, fileurl)
        # The index keeps names unique even when two downloads start within
        # the same clock tick.
        path = os.path.join(dest_dir, '%s_%d.jpg' % (time.time(), index))
        print(fileurl)
        try:
            urllib.request.urlretrieve(fileurl, path)
        except _FETCH_ERRORS as err:
            # Best effort: one broken image must not stop the crawl.
            print('skip image %s: %s' % (fileurl, err), file=sys.stderr)


url = "http://www.toutiao.com/"
queue = deque([url])
# Every URL that has ever been enqueued, so no page is queued or fetched twice.
visited = {url}
cnt = 0

while queue:
    url = queue.popleft()  # take the element at the head of the queue
    print('已經抓取: ' + str(cnt) + ' 正在抓取 <--- ' + url)
    cnt += 1

    try:
        with urllib.request.urlopen(url) as urlop:
            html = urlop.read().decode()
    except _FETCH_ERRORS as err:
        # Unreachable or undecodable page: skip it instead of parsing stale
        # data left over from the previous iteration.
        print('skip page %s: %s' % (url, err), file=sys.stderr)
        continue

    soup = BeautifulSoup(html, "html.parser")
    gethtml(soup, base_url=url)

    # This is where BeautifulSoup pays off: it parses the HTML and hands back
    # the elements directly. See the BeautifulSoup documentation:
    # https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#id37
    for x in soup.find_all('a'):
        href = x.get("href")
        if not href or not href.startswith(('http://', 'https://')):
            continue  # missing, relative or non-HTTP link
        if href in visited:
            continue
        visited.add(href)
        queue.append(href)
        print('加入隊列 ---> ' + href)

# Reached only once the queue has been drained.
print("----------------------end-------------------")
# coding='UTF-8'
import http.client
import io
import json
import os
import re
import sys
import time
import urllib
import urllib.parse
import urllib.request
from collections import deque

from bs4 import BeautifulSoup
# Re-wrap the raw stdout byte stream in a text layer that always encodes as
# UTF-8, so the non-ASCII progress messages printed by this script are written
# with an explicit encoding.
sys.stdout = io.TextIOWrapper( sys.stdout.buffer, encoding='utf8') # change the default encoding of standard output
# NOTE(review): neither container is referenced anywhere in the visible code;
# presumably an image-URL queue and its "already seen" set - confirm before
# relying on or removing them.
imgqu = deque()
imvli = set()
def gethtml(soup, dest_dir="k:/asd/", base_url=None):
    """Download every image referenced by an ``<img>`` tag in *soup*.

    Each image is stored as ``<timestamp>_<index>.jpg`` inside *dest_dir*
    (the ``.jpg`` suffix is applied regardless of the real image type).
    Downloading is best effort: a tag without ``src`` is skipped, and a failed
    download is reported on stderr without interrupting the remaining ones.

    Args:
        soup: Parsed document; only ``soup.find_all("img")`` and
            ``.get("src")`` on each result are used.
        dest_dir: Directory that receives the files; created when missing.
            Defaults to the previously hard-coded ``"k:/asd/"``.
        base_url: URL of the page *soup* came from. When given, relative
            ``src`` values are resolved against it; when omitted, ``src``
            values are used verbatim.

    Returns:
        None.
    """
    images = soup.find_all("img")
    if not images:
        return

    try:
        os.makedirs(dest_dir, exist_ok=True)
    except OSError as err:
        print('cannot create %s: %s' % (dest_dir, err), file=sys.stderr)
        return

    for index, tag in enumerate(images):
        fileurl = tag.get("src")
        if not fileurl:
            continue  # <img> without a usable src: nothing to download
        if base_url:
            fileurl = urllib.parse.urljoin(base_url, fileurl)
        # The index keeps names unique even when two downloads start within
        # the same clock tick.
        path = os.path.join(dest_dir, '%s_%d.jpg' % (time.time(), index))
        print(fileurl)
        try:
            urllib.request.urlretrieve(fileurl, path)
        except (OSError, ValueError, http.client.HTTPException) as err:
            # Network/file failures, malformed URLs and protocol errors are
            # expected while crawling; report and move on to the next image.
            print('skip image %s: %s' % (fileurl, err), file=sys.stderr)
# Breadth-first crawl: pages are fetched in the order their links were
# discovered, starting from the seed URL.
url = "http://www.toutiao.com/"
queue = deque([url])
# Every URL that has ever been enqueued, so no page is queued or fetched twice.
visited = {url}
cnt = 0

while queue:
    url = queue.popleft()  # take the element at the head of the queue
    print('已經抓取: ' + str(cnt) + ' 正在抓取 <--- ' + url)
    cnt += 1

    try:
        with urllib.request.urlopen(url) as urlop:
            html = urlop.read().decode()
    except (OSError, ValueError, http.client.HTTPException) as err:
        # Unreachable or undecodable page: skip it instead of parsing stale
        # data left over from the previous iteration.
        print('skip page %s: %s' % (url, err), file=sys.stderr)
        continue

    # Naming the parser keeps results independent of which optional parsers
    # happen to be installed.
    soup = BeautifulSoup(html, "html.parser")
    gethtml(soup)

    for x in soup.find_all('a'):
        href = x.get("href")
        if not href or not href.startswith(('http://', 'https://')):
            continue  # missing, relative or non-HTTP link
        if href in visited:
            continue
        visited.add(href)
        queue.append(href)
        print('加入隊列 ---> ' + href)

# Reached only once the queue has been drained.
print("----------------------end-------------------")