總結常用的功能小實例,快速學習並掌握 Python 技能
1.墨跡天氣
"""Look up the current weather for a city on tianqi.moji.com and print it."""
import json
import sys
import time
from urllib.parse import quote

import requests
from lxml.html import etree

# Seconds to wait for the Moji servers before giving up on a request.
REQUEST_TIMEOUT = 10


def _first_text(html, expression, default="N/A"):
    """Return the first node matched by *expression*, or *default* if none match."""
    matches = html.xpath(expression)
    return matches[0] if matches else default


class MoJiWeather():
    """Namespace for the three steps of a lookup: prompt, search, parse.

    The methods never use instance state, so they are static methods and can
    be called either on the class or on an instance.
    """

    @staticmethod
    def city_name(self=None):
        """Prompt for a city name on stdin and return it.

        The ``self`` parameter is ignored; it is kept (with a default) only so
        that older calls such as ``MoJiWeather.city_name(1)`` keep working.
        """
        return str(input("輸入城市名稱:"))

    @staticmethod
    def search_city(city_name):
        """Return the weather-page URL for *city_name*.

        Queries the city-search API and takes the ``cityId`` of the first
        entry in ``city_list``. If the response cannot be decoded or contains
        no such entry, prints an error message and exits with status 1.
        """
        # Escape the user's input so characters like "/" or "?" cannot alter
        # the request path.
        index_url = "http://tianqi.moji.com/api/citysearch/%s" % quote(str(city_name), safe="")
        response = requests.get(index_url, timeout=REQUEST_TIMEOUT)
        response.encoding = "utf-8"
        try:
            city_id = json.loads(response.text)['city_list'][0]['cityId']
        except (ValueError, KeyError, IndexError, TypeError):
            # Invalid JSON, missing keys, or an empty result list.
            print('城市名輸入錯誤')
            sys.exit(1)
        return "http://tianqi.moji.com/api/redirect/%s" % str(city_id)

    @staticmethod
    def parse(city_url):
        """Download *city_url* and print the weather fields found on the page.

        A field whose element is missing from the page is printed as "N/A"
        instead of aborting the whole report.
        """
        response = requests.get(city_url, timeout=REQUEST_TIMEOUT)
        response.encoding = 'utf-8'
        html = etree.HTML(response.text)

        # (label, XPath, suffix) for each single-valued field, in print order.
        fields = (
            ('當前城市:',
             "//div[@class='search_default']/em/text()", ''),
            ('空氣質量:',
             "//div[@class='left']/div[@class='wea_alert clearfix']/ul/li/a/em/text()", ''),
            ('當前溫度:',
             "//div[@class='left']/div[@class='wea_weather clearfix']/em/text()", '℃'),
            ('天氣狀況:',
             "//div[@class='wea_weather clearfix']/b/text()", ''),
            ('當前濕度:',
             "//div[@class='left']/div[@class='wea_about clearfix']/span/text()", ''),
            ('當前風速:',
             "//div[@class='left']/div[@class='wea_about clearfix']/em/text()", ''),
        )
        for label, expression, suffix in fields:
            print(label + _first_text(html, expression) + suffix)

        jingdian = html.xpath(
            "//div[@class='right']/div[@class='near'][2]/div[@class='item clearfix']/ul/li/a/text()"
        )
        print('附近景點:')
        for j in jingdian:
            print('\t\t' + j)


if __name__ == '__main__':
    print("歡迎使用墨跡天氣查詢系統")
    city_name = MoJiWeather.city_name()
    city_url = MoJiWeather.search_city(city_name)
    MoJiWeather.parse(city_url)
    print("謝謝使用本查詢系統")
    input("按任意鍵退出...")
2.TIOBE 排行榜
"""lxml example: scrape the TIOBE index top-20 table into a JSON-lines file."""
import json

import requests
from lxml import etree
from lxml.etree import ParseError
from requests.exceptions import RequestException

# The flattened text nodes of the table are consumed five per row, for at
# most twenty rows.
COLUMNS_PER_ROW = 5
MAX_ROWS = 20
# Seconds to wait for the server before giving up on the request.
REQUEST_TIMEOUT = 10


def one_to_page(url):
    """Fetch *url* and yield the table's text nodes in groups of five.

    This is a generator: the request is made when iteration starts. Nothing
    is yielded if the request fails or the page cannot be parsed, and
    iteration stops at the first incomplete group so consumers never see a
    short row.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36'
    }
    try:
        res = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        body = res.text  # page content
    except RequestException as e:
        print('request is error', e)
        return  # without a body there is nothing to parse

    try:
        html = etree.HTML(body, etree.HTMLParser())
        if html is None:
            return
        # All descendant text nodes of each row (text only, no image resources).
        result = html.xpath('//table[contains(@class,"table-top20")]/tbody/tr//text()')
    except ParseError as e:
        print(e.position)
        return

    for start in range(0, MAX_ROWS * COLUMNS_PER_ROW, COLUMNS_PER_ROW):
        row = result[start:start + COLUMNS_PER_ROW]
        if len(row) < COLUMNS_PER_ROW:
            break
        yield row


def write_file(data):
    """Append each row in *data* to ``test.txt`` as one JSON object per line.

    Each row must have at least five elements. Every record is also printed.
    Returns None.
    """
    # Open once for the whole batch; the with-block closes the file.
    with open('test.txt', 'a', encoding='utf-8') as f:
        for item in data:
            sul = {
                '2018年6月排行': item[0],
                '2017年6排行': item[1],
                '開發語言': item[2],
                '評級': item[3],
                '變化率': item[4]
            }
            f.write(json.dumps(sul, ensure_ascii=False) + '\n')
            print(sul)
    return None


def main():
    """Scrape the TIOBE index page and store the rows."""
    url = 'https://www.tiobe.com/tiobe-index/'
    data = one_to_page(url)
    ret = write_file(data)
    if ret is None:
        print('ok')


if __name__ == '__main__':
    main()
3.新聞列表
"""Moji Weather article crawler."""
import json

import requests
from lxml.html import etree
from lxml.etree import ParseError

# The list's text nodes are consumed eight per article, for at most twenty
# articles per page.
NODES_PER_ARTICLE = 8
MAX_ARTICLES_PER_PAGE = 20
# Seconds to wait for the server before giving up on a page.
REQUEST_TIMEOUT = 10


def parseHtml(content):
    """Yield the text nodes of the article list in *content*, eight at a time.

    This is a generator. Iteration stops at the first incomplete group, and
    nothing is yielded when the document cannot be parsed.

    NOTE(review): the text-node query is used here because write_log indexes
    positions 3 and 5 of each group. The previous @href query, followed by a
    debug print and exit(0), ended the program on the first page and left
    this code unreachable. That each article contributes exactly eight text
    nodes is an assumption - confirm against the live page.
    """
    try:
        html = etree.HTML(content, etree.HTMLParser())
        if html is None:
            return
        one = html.xpath('//ul[@class="advisory_list_item"]//text()')
    except ParseError as e:
        print(e.position)
        return

    for start in range(0, MAX_ARTICLES_PER_PAGE * NODES_PER_ARTICLE, NODES_PER_ARTICLE):
        group = one[start:start + NODES_PER_ARTICLE]
        if len(group) < NODES_PER_ARTICLE:
            break
        yield group


def write_log(data):
    """Append one JSON line per group in *data* to ``moji.log``.

    Element 3 of each group is stored as the publication time and element 5
    as the article title. Every record is also printed. Returns None.
    """
    # Open once for the whole batch; the with-block closes the file.
    with open('moji.log', 'a', encoding='utf-8') as f:
        for item in data:
            msg = {
                '發文時間': item[3],
                '文章標題': item[5]
            }
            f.write(json.dumps(msg, ensure_ascii=False) + '\n')
            print(msg)
    return None


def main():
    """Crawl news list pages 1 through 72 and log every article found."""
    for page in range(1, 73):
        url = 'https://tianqi.moji.com/news/list/moji/{}'.format(page)
        try:
            res = requests.get(url, timeout=REQUEST_TIMEOUT)
        except requests.exceptions.RequestException as e:
            # One unreachable page should not abort the remaining pages.
            print('request is error', e)
            continue
        res.encoding = 'utf-8'
        content = parseHtml(res.text)
        ret = write_log(content)
        if ret is None:
            print('ok')


if __name__ == '__main__':
    main()
4.爬取IP
"""Collect proxy ``ip:port`` entries from the xicidaili.com listing pages."""
import random
import re

import requests
from bs4 import BeautifulSoup

ua_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Safari/537.36",
    # Stray spaces removed from this entry so it is a well-formed User-Agent.
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36"
]

# Patterns applied to the stringified <td> cells; compiled once, not per page.
IP_PATTERN = re.compile(r'<td>(\d+\.\d+\.\d+\.\d+)</td>')  # matches an IP cell
PORT_PATTERN = re.compile(r'<td>(\d+)</td>')  # matches a port cell
CHECK_API = "http://ip.taobao.com/service/getIpInfo2.php?ip="
# Seconds to wait for a listing page / for the check API.
PAGE_TIMEOUT = 10
CHECK_TIMEOUT = 2


def _ip_responds(ip, headers):
    """Return True if the check API answers a query about *ip* in time.

    NOTE(review): the request goes directly to the lookup API and is not
    routed through the proxy, so this does not prove the proxy itself works.
    """
    try:
        requests.get(url=CHECK_API + ip, headers=headers, timeout=CHECK_TIMEOUT)
    except requests.exceptions.RequestException as e:
        print("此ip %s 已失效:%s" % (ip, e))
        return False
    print("ip:%s 可用" % ip)
    return True


def ip_parse_xici(page):
    """Scrape listing pages 1..page and return the entries that pass the check.

    :param page: number of pages to collect (anything ``int()`` accepts)
    :return: list of ``"ip:port"`` strings; the list is also printed
    """
    ip_list = []
    # +1 so that asking for N pages really visits pages 1 through N.
    for pg in range(1, int(page) + 1):
        url = 'http://www.xicidaili.com/nn/' + str(pg)
        user_agent = random.choice(ua_list)
        my_headers = {
            'Accept': 'text/html, application/xhtml+xml, application/xml;',
            'Accept-Encoding': 'gzip, deflate, sdch',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Referer': 'http://www.xicidaili.com/nn',
            'User-Agent': user_agent
        }
        try:
            r = requests.get(url, headers=my_headers, timeout=PAGE_TIMEOUT)
        except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
            print('ConnectionError')
            continue

        soup = BeautifulSoup(r.text, 'html.parser')
        cells = str(soup.find_all('td'))
        ips = IP_PATTERN.findall(cells)  # all IPs on the page
        ports = PORT_PATTERN.findall(cells)  # all ports on the page

        api_headers = {'User-Agent': user_agent}
        # Pair each IP with its port before filtering, so dropping a dead
        # entry can neither skip its neighbour nor misalign the two lists.
        ip_list += [
            ip + ':' + port
            for ip, port in zip(ips, ports)
            if _ip_responds(ip, api_headers)
        ]
        print('第{}頁ip采集完成'.format(pg))

    print(ip_list)
    return ip_list


if __name__ == '__main__':
    xici_pg = input("請輸入需要采集的頁數:")
    ip_parse_xici(page=xici_pg)