XPath常用規則 / 從當前節點選取直接子節點 // 從當前節點選取子孫節點 . 選取當前節點 .. 選取當前節點的父節點 @ 選取屬性 * 通配符,匹配任何元素節點 @* 選取所有屬性 [@attrib] 選取具有給定屬性的所有元素 [@attrib='value'] 選取給定屬性具有給定值的所有元素 [tag] 選取所有具有指定元素的直接子節點 [tag='text'] 選取所有具有指定元素並且文本內容是text節點

"""爬取豆瓣網站的信息""" import requests from lxml import etree # 請求頭設置 headers = { "User-Agentv": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36", "Referer": "https://movie.douban.com/", } url = "https://movie.douban.com/cinema/nowplaying/chongqing/" # 發起請求 rep = requests.get(url, headers=headers) text = rep.text # 轉換成html格式 html = etree.HTML(text) # 找到子孫節點ul標簽 ul = html.xpath("//ul[@class='lists']")[0] # 當前ul下的所有li標簽 lis = ul.xpath("./li") movies = [] # 循環每個li標簽 for li in lis: # 直接@li標簽的屬性獲取值 title = li.xpath("@data-title")[0] score = li.xpath("@data-score")[0] region = li.xpath("@data-region")[0] actors = li.xpath("@data-actors")[0] director = li.xpath("@data-director")[0] liimg = li.xpath(".//img/@src") movie = { "title": title, "score": score, "region": region, "actors": actors, "director": director, "liimg": liimg, } movies.append(movie) print(movies)
電影天堂

"""Spider for dytt (電影天堂): crawl list pages, then scrape each detail page."""
import requests
from lxml import etree

BASE_DOMAIN = "http://www.ygdy8.net"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36",
}


def get_detail_urls(url):
    """Fetch one list page and return absolute URLs of its movie detail pages.

    Returns a lazy map of absolute URL strings.
    """
    rep = requests.get(url=url, headers=HEADERS)
    # Gotcha: the page declares gbk but contains illegal byte sequences,
    # so decode with "ignore" to drop them instead of raising.
    text = rep.content.decode("gbk", "ignore")
    html = etree.HTML(text)
    # The detail links follow a fixed pattern: <a> inside <table class="tbspan">.
    detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
    # The hrefs are site-relative; prefix the domain to make them absolute.
    detail_urls = map(lambda u: BASE_DOMAIN + u, detail_urls)
    return detail_urls


def parse_detail_page(url):
    """Scrape one movie detail page and return a dict of its metadata.

    Keys: title, cover, poster, year, country, category, douban_rating,
    duration, director, actors (list), profile, download_url (list).
    """
    movie = {}
    res = requests.get(url, headers=HEADERS)
    # BUG FIX: same gbk pages with illegal bytes as the list pages —
    # decode with "ignore" here too, matching get_detail_urls.
    text = res.content.decode("gbk", "ignore")
    html = etree.HTML(text)

    title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
    movie["title"] = title

    zoomE = html.xpath("//div[@id='Zoom']")[0]
    imgs = zoomE.xpath(".//img/@src")
    # Slicing (not indexing) avoids an IndexError when fewer images exist;
    # the values are therefore 0- or 1-element lists.
    movie["cover"] = imgs[0:1]
    movie["poster"] = imgs[1:2]

    infos = zoomE.xpath(".//text()")

    def parse_info(info, rule):
        # Shared helper: strip the "◎…" label prefix and surrounding whitespace.
        return info.replace(rule, "").strip()

    for index, info in enumerate(infos):
        if info.startswith("◎年 代"):
            movie["year"] = parse_info(info, "◎年 代")
        elif info.startswith("◎產 地"):
            movie["country"] = parse_info(info, "◎產 地")
        elif info.startswith("◎類 別"):
            movie["category"] = parse_info(info, "◎類 別")
        elif info.startswith("◎豆瓣評分"):
            movie["douban_rating"] = parse_info(info, "◎豆瓣評分")
        elif info.startswith("◎片 長"):
            movie["duration"] = parse_info(info, "◎片 長")
        elif info.startswith("◎導 演"):
            movie["director"] = parse_info(info, "◎導 演")
        elif info.startswith("◎主 演"):
            # The cast continues on the following text nodes until the
            # next "◎標…" label line.
            actors = [parse_info(info, "◎主 演")]
            for x in range(index + 1, len(infos)):
                actor = infos[x].strip()
                if actor.startswith("◎標"):
                    break
                actors.append(actor)
            movie["actors"] = actors
        elif info.startswith("◎簡 介"):
            # BUG FIX: the original discarded the parsed label line and kept
            # only the last synopsis line before "◎獲獎情況"; accumulate the
            # whole synopsis instead (mirrors the actors branch).
            profiles = []
            for x in range(index + 1, len(infos)):
                profile = infos[x].strip()
                if profile.startswith("◎獲獎情況"):
                    break
                profiles.append(profile)
            movie["profile"] = "".join(profiles)

    # The download link sits in the distinctly colored table cell.
    movie["download_url"] = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")
    return movie


def spider():
    """Crawl list pages 1..179 and print every scraped movie dict."""
    base_url = "http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html"
    movies = []
    for i in range(1, 180):
        url = base_url.format(i)
        # First collect the detail-page URLs from the list page...
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            # ...then scrape each detail page.
            movie = parse_detail_page(detail_url)
            movies.append(movie)
            print(movies)


if __name__ == '__main__':
    spider()
貓眼電影

"""貓眼電影爬取""" import requests from lxml import etree BASE_URL = "http://maoyan.com" HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36" } def get_detail_urls(url): # 具體獲取詳情url rep = requests.get(url=url, headers=HEADERS) html = etree.HTML(rep.text) # 找到詳情url detail_urls = html.xpath("//dl//div[@class='movie-item']/a/@href") detail_urls = map(lambda url: BASE_URL+url, detail_urls) return detail_urls def parse_detail_page(url): # 獲取數據 movie = {} res = requests.get(url=url, headers=HEADERS) text = res.content.decode("utf-8") html = etree.HTML(text) name = html.xpath("//div[@class='movie-brief-container']/h3/text()")[0] movie["name"] = name lis = html.xpath("//div[@class='movie-brief-container']//li") for li in range(len(lis)): if li == 0: movie["plot"] = lis[li].xpath("./text()")[0] if li == 1: movie["country"] = lis[li].xpath("./text()")[0].split()[0] movie["duration"] = lis[li].xpath("./text()")[0].split()[1] if li == 2: try: movie["release_time"] = lis[li].xpath("./text()")[0] except Exception as e: continue avatar = html.xpath("//div[@class='avatar-shadow']/img/@src") movie["avatar"] = avatar content = html.xpath("//div[@class='mod-content']/span/text()")[0] movie["content"] = content container = html.xpath("//div[@class='comment-list-container']/ul") for li in container: li_name = li.xpath(".//span[@class='name']/text()") li_content = li.xpath(".//div[@class='comment-content']/text()") livs = zip(li_name, li_content) movie["user"] = dict((name, value)for name, value in livs) return movie def spider(): # 獲取url自行拼接 base_url = "http://maoyan.com/films?showType=1&offset={}" movies = [] for i in range(0, 31, 30): url = base_url.format(i) # 拿到url之后去找到詳情頁面url detail_urls = get_detail_urls(url) for detail_url in detail_urls: # 去獲取詳情頁面數據 movie = parse_detail_page(detail_url) movies.append(movie) print(movie) print(movies) if __name__ == '__main__': spider()
騰訊招聘網

"""爬取騰訊招聘網找工作""" import requests from lxml import etree HEADERS = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3554.0 Safari/537.36", "Referer": "https://hr.tencent.com/position.php?keywords=python&lid=2218&tid=87&start=0" } BASE_URL = "https://hr.tencent.com/" def get_detail_urls(url): rep = requests.get(url=url, headers=HEADERS) html = etree.HTML(rep.text) detail_urls = html.xpath("//table//td[@class='l square']/a/@href") detail_urls = map(lambda url: BASE_URL+url, detail_urls) return detail_urls def get_parse_detail(url): job_offers = {} res = requests.get(url=url, headers=HEADERS) html = etree.HTML(res.text) position = html.xpath("//table//td[@class='l2 bold size16']/text()")[0] job_offers["position"] = position tds = html.xpath("//table//tr[@class='c bottomline']/td/text()") for i in range(len(tds)): job_offers["location"] = tds[0] job_offers["category"] = tds[1] job_offers["recruits"] = tds[2] duties = html.xpath("//tr[3][contains(@class, 'c')]//li/text()") job_offers["duties"] = duties claim = html.xpath("//tr[4][contains(@class, 'c')]//li/text()") job_offers["claim"] = claim return job_offers def spider(): base_url = "https://hr.tencent.com/position.php?keywords=python&lid=2218&tid=87&start={}#a" squres = [] for i in range(0, 340, 10): url = base_url.format(i) detail_urls = get_detail_urls(url) for detail_url in detail_urls: squre = get_parse_detail(detail_url) squres.append(squre) print(squre) if __name__ == '__main__': spider()
可參考博客鏈接(我就懶得寫了):http://www.cnblogs.com/zhangxinqi/p/9210211.html#_label11