python3.6 簡單爬蟲

本文轉載自查看原文 2017-03-23 19:14 7054

# coding='UTF-8'
from bs4 import BeautifulSoup  # 引入beautifulsoup 解析html事半功倍
import re
import urllib
import urllib.request
import sys
import io
import json
from collections import deque
import time


# Re-wrap standard output as UTF-8. This matters: printing non-ASCII URLs
# and messages otherwise raises encoding errors on consoles whose default
# encoding is narrower than UTF-8.
sys.stdout = io.TextIOWrapper(
    sys.stdout.buffer, encoding='utf8')


def gethtml(soup):
    """Download every ``<img>`` referenced in *soup* to the image folder.

    Each image is saved as ``k:/asd/<timestamp>.jpg``, where the timestamp
    is ``time.time()`` at the moment of the download. The ``src`` value of
    every image tag is printed, including ``None`` for tags without one.

    Downloading is best-effort: a failure for one image is reported on
    standard error and the remaining images are still attempted.

    Args:
        soup: A parsed document exposing ``find_all("img")`` whose results
            expose ``get("src")`` (e.g. a ``BeautifulSoup`` object).

    Returns:
        None.
    """
    for img in soup.find_all("img"):
        fileurl = img.get("src")
        print(fileurl)
        if not fileurl:
            # Tag has no usable source; nothing to download.
            continue
        path = "k:/asd/" + '%s.jpg' % time.time()
        try:
            urllib.request.urlretrieve(fileurl, path)
        except Exception as exc:
            # Relative/invalid URLs, network errors and unwritable target
            # paths all land here; keep crawling but do not hide the cause.
            print('image download failed: %s (%s)' % (fileurl, exc),
                  file=sys.stderr)


# Breadth-first crawl starting from the seed page: fetch a page, download its
# images via gethtml(), then queue every absolute http(s) link not seen yet.
url = "http://www.toutiao.com/"
queue = deque()
visited = set()  # every URL that has been queued or fetched
cnt = 0  # number of URLs taken from the queue so far

queue.append(url)
visited.add(url)


while queue:
    url = queue.popleft()  # oldest queued URL first (FIFO => breadth-first)

    print('已經抓取: ' + str(cnt) + '   正在抓取 <---  ' + url)
    cnt += 1

    try:
        # The timeout keeps one unresponsive host from stalling the crawl;
        # the context manager closes the connection once the body is read.
        with urllib.request.urlopen(url, timeout=10) as urlop:
            html = urlop.read().decode()
    except Exception as exc:
        # Unreachable, malformed or non-UTF-8 pages are skipped entirely so
        # that no undefined or stale ``html`` is parsed below.
        print('skipped %s (%s)' % (url, exc), file=sys.stderr)
        continue

    # Name the parser explicitly so the result does not depend on which
    # optional parser libraries happen to be installed.
    soup = BeautifulSoup(html, "html.parser")
    gethtml(soup)

    # BeautifulSoup hands back the <a> elements directly, so the links can
    # be read without any manual HTML parsing. Documentation:
    # https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html#id37
    for x in soup.find_all('a'):
        href = x.get("href")
        # Only absolute http(s) links can be fetched by urlopen().
        if not href or not href.startswith(('http://', 'https://')):
            continue
        if href in visited:
            continue
        # Mark at enqueue time so the same link is never queued twice.
        visited.add(href)
        queue.append(href)
        print('加入隊列 --->  ' + href)

print("----------------------end-------------------")

# coding='UTF-8'
# Plain-text copy of the crawler: breadth-first crawl from a seed page that
# downloads every image it finds and follows absolute http(s) links.
from bs4 import BeautifulSoup
import re
import urllib
import urllib.request
import sys
import io
import json
from collections import deque
import time

# Change the default encoding of standard output to UTF-8.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf8')

# Not referenced by the rest of this script; kept as originally declared.
imgqu = deque()
imvli = set()


def gethtml(soup):
    """Download every ``<img>`` referenced in *soup* to ``k:/asd/``.

    Each file is named ``<time.time()>.jpg``. The ``src`` of every tag is
    printed. A failed download is reported on standard error and the
    remaining images are still attempted.

    Args:
        soup: A parsed document exposing ``find_all("img")``.

    Returns:
        None.
    """
    for img in soup.find_all("img"):
        fileurl = img.get("src")
        print(fileurl)
        if not fileurl:
            # Tag has no usable source; nothing to download.
            continue
        path = "k:/asd/" + '%s.jpg' % time.time()
        try:
            urllib.request.urlretrieve(fileurl, path)
        except Exception as exc:
            print('image download failed: %s (%s)' % (fileurl, exc),
                  file=sys.stderr)


url = "http://www.toutiao.com/"
queue = deque()
visited = set()  # every URL that has been queued or fetched
cnt = 0  # number of URLs taken from the queue so far

queue.append(url)
visited.add(url)

while queue:
    url = queue.popleft()  # take the element at the head of the queue

    print('已經抓取: ' + str(cnt) + ' 正在抓取 <--- ' + url)
    cnt += 1

    try:
        # The timeout keeps one unresponsive host from stalling the crawl.
        with urllib.request.urlopen(url, timeout=10) as urlop:
            html = urlop.read().decode()
    except Exception as exc:
        # Skip pages that cannot be fetched or decoded, so that no
        # undefined or stale ``html`` is parsed below.
        print('skipped %s (%s)' % (url, exc), file=sys.stderr)
        continue

    soup = BeautifulSoup(html, "html.parser")
    gethtml(soup)

    for x in soup.find_all('a'):
        href = x.get("href")
        # Only absolute http(s) links can be fetched by urlopen().
        if not href or not href.startswith(('http://', 'https://')):
            continue
        if href in visited:
            continue
        # Mark at enqueue time so the same link is never queued twice.
        visited.add(href)
        queue.append(href)
        print('加入隊列 ---> ' + href)

print("----------------------end-------------------")

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯繫本站郵箱 yoyou2525@163.com 刪除。

猜您在找：python3.6 urllib.request庫實現簡單的網絡爬蟲、下載圖片；華為雲照片的爬蟲程序更新(python3.6)；Python3.6安裝；linux安裝python3.6；CentOS安裝python3.6；ubuntu安裝python3.6；mac安裝python3.6；Python3.6學習筆記（五）；python3.6配置flask；Python3.6 websocket開發