抓取美团商家信息
# Scrape shop (merchant) information from Meituan: collect the category links
# from the city home page, read the shop ids embedded in each category page,
# then fetch every shop page and print its name, address and contact details.
import json

import requests
from bs4 import BeautifulSoup

# Regional (city) site root.
url = 'http://km.meituan.com/'

# Shop detail page template; the placeholder is the shop id.
url_shop = 'http://km.meituan.com/shop/{}'

# Seconds to wait for a response before giving up, so a stalled connection
# cannot hang the whole run.
REQUEST_TIMEOUT = 10

# Request headers sent with the category and shop page requests.
# NOTE: no explicit 'Host' entry -- the original hard-coded 'bj.meituan.com'
# while every request goes to km.meituan.com; requests fills in the correct
# Host from the URL when the key is absent.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8',
    'Cache-Control': 'max-age=0',
    'DNT': '1',
    'Proxy-Connection': 'keep-alive',
    'Referer': 'http://bj.meituan.com/shop/286725?acm=UwunyailsW15518532529028663069.286725.1&mtt=1.index%2Fdefault%2Fpoi.pz.1.j4cijrmg&cks=58899',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
}


def get_start_menu_links():
    """Return the href of every second-level menu header on the home page.

    Fetches ``url`` and, for each ``div.J-nav-item``, follows the nested
    ``div > div > dl > dt > a`` path to read the link target. Navigation
    items that do not have that structure are skipped.

    Returns:
        A list of href strings, in page order.

    Raises:
        requests.RequestException: if the home page cannot be fetched.
    """
    html = requests.get(url, timeout=REQUEST_TIMEOUT).text
    soup = BeautifulSoup(html, 'lxml')
    links = []
    for nav_item in soup.find_all('div', class_='J-nav-item'):
        try:
            anchor = nav_item.find('div').find('div').find('dl').find('dt').find('a')
            links.append(anchor['href'])
        except (AttributeError, TypeError, KeyError):
            # A find() along the path returned None, or the <a> has no href.
            continue
    return links


def get_shop_ids(url, headers=None):
    """Return the shop ids listed on one category page.

    The ids are read from the ``data-async-params`` attribute of the
    ``div.J-scrollloader`` container: that attribute holds JSON whose
    ``data`` field is itself a JSON string containing ``poiidList``.

    Args:
        url: Category page URL to fetch.
        headers: Optional request headers passed to ``requests.get``.

    Returns:
        The ``poiidList`` value, or an empty list when the container, the
        attribute or the expected JSON fields are missing.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    html = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT).text
    soup = BeautifulSoup(html, 'lxml')
    container = soup.find('div', class_='J-scrollloader cf J-hub')
    try:
        async_params = json.loads(container['data-async-params'])
        shop_ids = json.loads(async_params.get('data')).get('poiidList')
    except (TypeError, KeyError, ValueError, AttributeError):
        # Container or attribute missing, or the payload is not the expected
        # JSON-in-JSON shape: treat the page as having no shops.
        return []
    return shop_ids or []


def main(pages=range(4, 5)):
    """Print name, address and contact details for every shop found.

    Walks each menu link from ``get_start_menu_links``, each page number in
    ``pages`` and each shop id on that page. A banner line is printed for
    every shop id; shops whose detail section cannot be parsed are skipped
    after the banner.

    Args:
        pages: Re-iterable collection of category page numbers to visit.
            Defaults to page 4 only, as in the original script.
    """
    for link in get_start_menu_links():
        for page_num in pages:
            category_url = link + '/all/page{}'.format(page_num)
            for shop_id in get_shop_ids(category_url, headers=headers):
                html = requests.get(
                    url_shop.format(shop_id),
                    headers=headers,
                    timeout=REQUEST_TIMEOUT,
                ).text
                soup = BeautifulSoup(html, 'lxml')
                shop_detail = soup.find('div', class_='summary biz-box fs-section cf')
                # %s for shop_id so ids delivered as strings do not raise.
                print("==================================pageNum %d shop_id: %s===================================================" % (page_num, shop_id))
                try:
                    left = shop_detail.find('div', class_='fs-section__left')
                    name = left.find('h2').find('span').text
                    under_title = left.find('p', class_='under-title')
                    address = under_title.find('span').text
                    contact = under_title.find_next_sibling().text
                except AttributeError:
                    # Some expected element is absent (find() returned None).
                    continue
                print("名称: " + name)
                print("地址: " + address)
                print("联系方式: " + contact)


if __name__ == '__main__':
    main()