Today the teacher covered Scrapy, the Python crawling framework, and then walked us through building a small spider that scrapes job postings from 51job and saves them to a database.
Environment: Python 3.6 with the PyCharm editor.
The spider itself:
import scrapy
from ..items import JobspidersItem


class JobsspiderSpider(scrapy.Spider):
    name = 'jobsspider'
    # allowed_domains = ['search.51job.com/list/010000,000000,0000,00,9,99,%2520,2,1.html']
    # start_urls = ['https://search.51job.com/list/010000,000000,0000,00,9,99,%2520,2,1.html/']
    start_urls = [
        'https://search.51job.com/list/010000,000000,0000,01,9,99,python,2,1.html?lang=c&stype=&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&providesalary=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare=']

    def parse(self, response):
        # each job posting is a div.el row inside the dw_table container
        currentPageItems = response.xpath(
            '/html/body/div[@class="dw_wp"]/div[@class="dw_table"]/div[@class="el"]')
        print(currentPageItems)
        # currentPageItems = response.xpath('//div[@class="el"]')

        for jobItem in currentPageItems:
            print('----', jobItem)
            jobspidersItem = JobspidersItem()

            jobPosition = jobItem.xpath('p[@class="t1 "]/span/a/text()').extract()
            if jobPosition:
                jobspidersItem['jobPosition'] = jobPosition[0].strip()

            jobCompany = jobItem.xpath('span[@class="t2"]/a/text()').extract()
            if jobCompany:
                jobspidersItem['jobCompany'] = jobCompany[0].strip()

            jobArea = jobItem.xpath('span[@class="t3"]/text()').extract()
            if jobArea:
                jobspidersItem['jobArea'] = jobArea[0].strip()

            jobSale = jobItem.xpath('span[@class="t4"]/text()').extract()
            if jobSale:
                jobspidersItem['jobSale'] = jobSale[0].strip()

            jobDate = jobItem.xpath('span[@class="t5"]/text()').extract()
            if jobDate:
                jobspidersItem['jobDate'] = jobDate[0].strip()

            yield jobspidersItem  # hand the item to the output pipelines

        nextPageURL = response.xpath('//li[@class="bk"]/a/@href').extract()  # next-page link
        print(nextPageURL)
        if nextPageURL:
            url = response.urljoin(nextPageURL[-1])
            print('url', url)
            # request the next page and keep parsing it with the same callback
            yield scrapy.Request(url, self.parse, dont_filter=False)
        else:
            print("Exit: no next page")
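The spider is started with Scrapy's crawl command (scrapy crawl jobsspider) from the project root. As a convenience it can also be launched from inside PyCharm with a small script; the following is only a sketch, and it assumes the default Scrapy project layout so that get_project_settings() can find settings.py:

# run.py -- optional launcher, equivalent to running `scrapy crawl jobsspider`
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # loads settings.py, including ITEM_PIPELINES
    process.crawl('jobsspider')  # the name defined on JobsspiderSpider
    process.start()  # blocks until the crawl finishes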
items.py defines the five item fields:
import scrapy


class JobspidersItem(scrapy.Item):
    # define the fields for your item here like:
    jobPosition = scrapy.Field()
    jobCompany = scrapy.Field()
    jobArea = scrapy.Field()
    jobSale = scrapy.Field()
    jobDate = scrapy.Field()
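A scrapy.Item works like a dict, except that only the declared fields may be assigned, which catches typos in field names early. A quick illustration (not part of the project code; the values are made up):

item = JobspidersItem()
item['jobPosition'] = 'Python developer'  # fine: jobPosition is declared above
print(dict(item))  # {'jobPosition': 'Python developer'}
# item['salary'] = '15k'  # would raise KeyError: JobspidersItem does not support field: salary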
pipelines.py, the output pipeline that prints each item to the console:
class JobspidersPipeline(object):
    def process_item(self, item, spider):
        print('Position:', item['jobPosition'])
        print('Company:', item['jobCompany'])
        print('Location:', item['jobArea'])
        print('Salary:', item['jobSale'])
        print('Posted:', item['jobDate'])
        print('----------------------------')
        return item
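Neither this pipeline nor the MySQL one further down does anything until it is enabled in settings.py. A minimal sketch, assuming the project package is named jobspiders (the exact dotted path depends on how the project was generated):

# settings.py
ITEM_PIPELINES = {
    'jobspiders.pipelines.JobspidersPipeline': 300,  # lower numbers run earlier
}
# ROBOTSTXT_OBEY = False  # may be needed if robots.txt blocks the listing pages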
pipelinesmysql.py writes the scraped data to MySQL; the import on the first line pulls in a database helper class we wrapped up in an earlier exercise.
from week5_day04.dbutil import dbutil

# Assignment: a custom pipeline that saves the full scraped record to MySQL
class JobspidersPipeline(object):
    def process_item(self, item, spider):
        dbu = dbutil.MYSQLdbUtil()
        dbu.getConnection()  # open the connection and start a transaction
        try:
            # insert one row per item
            sql = "insert into t_job (jobname,jobcompany,jobarea,jobsale,jobdata)values(%s,%s,%s,%s,%s)"
            dbu.execute(sql, (item['jobPosition'], item['jobCompany'], item['jobArea'],
                              item['jobSale'], item['jobDate']), True)
            dbu.commit()
            print('Inserted into the database!')
        except:
            dbu.rollback()
            dbu.commit()  # commit after rolling back
        finally:
            dbu.close()
        return item
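The dbutil class is our own wrapper from an earlier lesson and is not shown here. For anyone without it, a rough equivalent written directly against pymysql could look like the sketch below; the connection parameters and the t_job table definition are assumptions, only the column names come from the INSERT statement above. Like the console pipeline, it would also need its own entry in ITEM_PIPELINES (for example 'jobspiders.pipelinesmysql.MysqlJobsPipeline': 400).

import pymysql

# Assumed table, matching the column names used in the INSERT above:
#   CREATE TABLE t_job (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       jobname VARCHAR(100), jobcompany VARCHAR(100), jobarea VARCHAR(50),
#       jobsale VARCHAR(50), jobdata VARCHAR(50)
#   );

class MysqlJobsPipeline(object):
    """Hypothetical stand-in for the dbutil-based pipeline above."""

    def open_spider(self, spider):
        # connection parameters are placeholders -- adjust to your own server
        self.conn = pymysql.connect(host='localhost', user='root', password='root',
                                    db='jobs', charset='utf8mb4')

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        sql = ("insert into t_job (jobname,jobcompany,jobarea,jobsale,jobdata) "
               "values (%s,%s,%s,%s,%s)")
        try:
            with self.conn.cursor() as cursor:
                cursor.execute(sql, (item.get('jobPosition'), item.get('jobCompany'),
                                     item.get('jobArea'), item.get('jobSale'),
                                     item.get('jobDate')))
            self.conn.commit()
        except Exception:
            self.conn.rollback()
        return item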
The final result:
This most basic of 51job spiders was my way into the Scrapy framework, and it turns out to be a pretty handy tool.