Python Crawler Learning, Day 1 -- Driving the Browser with Selenium and ChromeDriver to Scrape Web Pages


#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/7/12 21:10
# @Author : chenxiaowei
# @Email : chen1020xiaowei@163.com
# @File : vip.py

from pymongo.errors import ConfigurationError
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from pyquery import PyQuery
from urllib3.exceptions import NewConnectionError, MaxRetryError
from config_vip import *
from multiprocessing import Pool
from selenium.webdriver.chrome.options import Options
import os
import pymongo
import requests
import hashlib
import time

# Pick a driver according to browser_method in config_vip.py.
if browser_method == 0:
    browser = webdriver.Chrome()
    print('You chose the Chrome() method...')
elif browser_method == 1:
    # Note: PhantomJS support is deprecated in newer Selenium releases.
    browser = webdriver.PhantomJS(service_args=['--load-images=false', '--disk-cache=false'])
    print('You chose the PhantomJS() method...')
else:
    chrome_option = Options()
    chrome_option.add_argument('--headless')
    browser = webdriver.Chrome(options=chrome_option)
    print('You chose the headless Chrome method...')

browser.set_window_size(1920, 1080)
wait = WebDriverWait(browser, 10)

# Create the MongoDB client and the database object.
try:
    client = pymongo.MongoClient(mongo_url)
    database = client[mongo_database]
except (TypeError, ConfigurationError):
    print('Failed to create the database'.center(130, '*'))


def drop_down_scrollbar():
    # Scroll the page down step by step so that lazy-loaded goods are rendered.
    times = 1
    while times < total_times:
        js = "var q=document.documentElement.scrollTop={}".format(times * size)
        browser.execute_script(js)
        time.sleep(1)
        times += 1


def get_search(search_word):
    # get_search(): open the home page, run a keyword search and scrape page 1.
    url = main_url
    browser.get(url)
    time.sleep(3)
    try:
        search_bar = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#J-search > div.c-search-form > input')))
        enter_button = wait.until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '#J-search > div.c-search-form > a > span')))
        # Wait until the search box and the search button are usable.
        search_bar.send_keys(search_word)
        time.sleep(1)
        enter_button.click()
        # Type the keyword and click search.
        time.sleep(5)
        drop_down_scrollbar()
        pages = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, '#J_pagingCt > a:nth-child(6)')))
        # Total number of result pages; used as the loop bound in main().
        print('The search for {} returned {} pages of results'.format(search_word, pages.text))
        time.sleep(3)
        print('Start fetching {}, page {}...'.format(search_word, str(1)))
        get_page_detail(search_word)
        print('Finished fetching {}, page {}...'.format(search_word, str(1)))
        return pages.text
    except TimeoutException as e:
        # Bind the instance; TimeoutException.args on the class is meaningless.
        print('Page did not finish loading, search aborted!', e.args)
    except WebDriverException as e:
        print(e.args)


def get_next_page(search_word, page):
    # Jump straight to a given result page via the URL template.
    try:
        url1 = url_search.format(search_word, str(page))
        # The result pages follow a fixed URL pattern, so a template is enough.
        print('Start fetching {}, page {}...\n'.format(search_word, page))
        browser.get(url1)
        drop_down_scrollbar()
        get_page_detail(search_word)
        print('Finished fetching {}, page {}...\n'.format(search_word, page))
    except TimeoutException as e:
        print('Timed out while loading the page!', e.args)
    except WebDriverException as e:
        print(e.args)


def get_page_detail(search_word):
    # Parse the rendered page and extract every goods item on it.
    try:
        source = browser.page_source
        html = PyQuery(source)
        print('Page source parsed'.center(130, '*'))
        # PyQuery parses the page source; items() iterates over the matched nodes.
        good_items = html('.goods-list .goods-list-item').items()
        for item in good_items:
            # Pull each field out with find() and a CSS selector.
            goods = {
                'good-title': item.find('.goods-title-info').text().split('\n')[1],
                'good-sells-price': item.find('.goods-info .goods-price-wrapper .goods-sells-price .price').text(),
                'good-market-price': item.find('.goods-info .goods-market-price').text()[2:],
                'good-discount': item.find('.goods-info .goods-discount').text(),
                'good-brand': item.find('.goods-info .goods-brand').text(),
                'image': 'http:{}'.format(item.find('.goods-slide .goods-image-link .J_img').attr('src')),
                'detail': 'http:{}'.format(item.find('.goods-slide .goods-image-link').attr('href'))
            }
            image_url = goods['image']
            content = get_image_content(image_url)
            if content:
                # Only download and store when the image request succeeded.
                download_image(content, search_word, image_url)
                save_to_mongodb(goods, search_word)
    except TimeoutException as e:
        print('Timed out while scraping the page!', e.args)


def save_to_mongodb(goods, database_table):
    # Store one goods dict in MongoDB; each keyword gets its own collection.
    try:
        if database[database_table].insert_one(goods):
            # insert_one() is the non-deprecated replacement for insert().
            print('Data stored'.center(130, '*'))
            print(goods, '\n')
    except Exception as e:
        print('Failed to write data!', e.args)


def get_image_content(url):
    # Fetch the raw image bytes; return False (or None) when the request fails.
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.content
        else:
            print('Image request failed!')
    except (ConnectionError, NewConnectionError, MaxRetryError) as e:
        # requests usually wraps the urllib3 errors, but catch them all the same.
        print(e.args)
        return False


def download_image(content, folder, image_url):
    # Save the image bytes under a per-day, per-keyword folder.
    time_stamp = time.strftime("%Y%m%d", time.localtime())
    path = file_path.format(mongo_database, time_stamp, folder)
    if not os.path.exists(path):
        os.makedirs(path)
    # Use the hex MD5 digest of the content as the file name, which also
    # deduplicates byte-identical images.
    filename = hashlib.md5(content).hexdigest()
    with open(file_type.format(path, filename), 'wb') as f:
        f.write(content)
        # No explicit close() needed: the with-block closes the file.
    print(' {} image downloaded'.format(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())).center(125, '*'))
    print(filename, image_url)


def main(search_word):
    # Scrape page 1 (which also reports the page count), then pages 2..end.
    total = get_search(search_word)
    if not total:
        # get_search() returns None when the search itself failed.
        return
    pages = int(total)
    page = 2
    if pages >= end:
        # Trailing pages are often empty and tend to raise exceptions, so cap at end.
        pages = end
    try:
        while page <= pages:
            get_next_page(search_word, page)
            page += 1
    except TimeoutException as e:
        print(e.args)


if __name__ == '__main__':
    # Each pool worker re-imports this module, so every process drives
    # its own browser instance.
    pool = Pool(processes=2)
    pool.map(main, keywords)
    pool.close()
    pool.join()
    # After the pool finishes, kill any leftover chromedriver and chrome
    # processes (Windows-only commands).
    os.system('taskkill /im chromedriver.exe /F')
    os.system('taskkill /im chrome.exe /F')
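
PhantomJS support has since been deprecated and later removed from Selenium, so `browser_method = 1` will not work on current versions. A minimal sketch of an up-to-date headless setup, assuming Selenium 4 and a recent Chrome (the `--headless=new` flag needs Chrome 109+):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless=new')           # newer headless mode (Chrome 109+)
opts.add_argument('--window-size=1920,1080')  # replaces set_window_size() above
headless_browser = webdriver.Chrome(options=opts)
headless_browser.get('https://www.vip.com/')
print(headless_browser.title)
headless_browser.quit()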
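
Also note that re-running the crawler inserts the same goods again, because nothing in save_to_mongodb() checks for duplicates. One possible fix is a unique index; a sketch, assuming the `detail` URL is unique per product (the example document and URL here are placeholders):

import pymongo
from pymongo.errors import DuplicateKeyError

client = pymongo.MongoClient('localhost')
collection = client['vip']['苹果']  # one collection per keyword, as in vip.py
# A unique index makes MongoDB reject a second document with the same detail URL.
collection.create_index([('detail', pymongo.ASCENDING)], unique=True)
try:
    # Hypothetical document, just to show the effect of the index.
    collection.insert_one({'detail': 'http://example.com/item-1', 'good-title': 'demo'})
except DuplicateKeyError:
    print('already stored, skipping')
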
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/7/12 23:48
# @Author : chenxiaowei
# @Email : chen1020xiaowei@163.com
# @File : config_vip.py
mongo_url = 'localhost'
mongo_database = 'vip'
# Database address and name.
main_url = 'https://www.vip.com/'

total_times = 16
size = 500
# Number of scroll steps and the pixel size of each step.

browser_method = 2
# How to drive the browser (0 = Chrome, 1 = PhantomJS, anything else = headless Chrome).

start = 1
end = 45
# Last page to scrape; some later pages have no content and easily raise exceptions.
url_search = 'https://category.vip.com/suggest.php?keyword={}&page={}&count=100&suggestType=brand#catPerPos'
# URL template for the result pages.
file_path = 'H:/Python_download/{}/{}/image/{}/'
file_type = '{}{}.jpg'
# File type and folder layout.


keywords = ['苹果', '雪梨', '香蕉']
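
For debugging, it is often handier to run a single keyword end to end without the process pool. A sketch, assuming vip.py and config_vip.py sit in the same directory (set `browser_method = 0` first so Chrome opens a visible window):

from vip import main  # importing vip.py also creates the browser and the DB client

main('苹果')  # scrape one keyword end to end, no multiprocessing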

 

