(一)正則表達式
(二)正則表達式語法:
經典例子:
IP地址:
(三)常用方法:
1,第一個方法:re.search()返回match對象
2,第二個方法:re.match():從字符串起始位置開始匹配;若第一個字符就不匹配則返回None,匹配成功則返回match對象
3,第三個方法:re.findall()
4,第四個方法:re.split()
5,第五個方法:re.finditer()返回match對象
6,第六個方法:re.sub()
7,第七個方法:re.compile()
8,兩種使用re對象的方法的方式:使用面向對象方式時,將正則表達式寫入compile()方法中,re對象的方法中就不用正則表達式參數了;
(四)Match對象
1,match對象的屬性:
2,match方法:
(五)正則表達式的匹配
1,貪婪匹配
2,最小匹配
例子:
import re

# Greedy matching demo: ".*" stretches as far as possible, so the match
# runs from the FIRST uppercase letter all the way to the LAST one.
rE = re.compile(r"[A-Z].*[A-Z]")
ls = rE.search("adaAdssdDsdsFdsdsdM")
print(ls.group(0))  # -> "AdssdDsdsFdsdsdM"
截圖:
import re

# Minimal (non-greedy) matching demo: ".*?" consumes as little as possible,
# so the match stops at the NEAREST following uppercase letter.
rE = re.compile(r"[A-Z].*?[A-Z]")
ls = rE.search("adaAdssdDsdsFdsdsdM")
print(ls.group(0))  # -> "AdssdD"
截圖:

(六)re庫的使用例子
實例一:
1,功能:
2,實現難點:
3,准備工作:
接口:https://s.taobao.com/search?q="搜索的關鍵詞"
分頁:第1頁:https://s.taobao.com/search?q="搜索的關鍵詞"…&s=0(s參數為商品偏移量,每頁44件商品)
第2頁:https://s.taobao.com/search?q="搜索的關鍵詞"xxxxxxxxxxxs=44
第3頁:https://s.taobao.com/search?q="搜索的關鍵詞"xxxxxxxxxxxs=88
商品名稱:"raw_title":"uek小學生書包男孩女生1-3-6年級護脊雙肩背包6-12歲輕便兒童書包"(存入腳本)
商品價格:"view_price":"198.00"(腳本中)
獲取網頁請求頭的信息方式:
4,實現步驟:
import re


# Step 1: fetch the raw HTML of a search-result page.
def getHTMLText(url):
    """Download *url* and return its decoded text, or "" on any request failure.

    Browser-like headers (User-Agent, Cookie, Referer, ...) are sent because
    Taobao redirects "bot-looking" requests to a login/captcha page.
    NOTE(review): the hard-coded Cookie/Referer are a captured browser session
    and will expire — refresh them from a logged-in browser before running.
    """
    import requests  # third-party; local import keeps the module importable without it

    kv = {
        'Cookie': 'thw=cn; t=cce1b71dee0c6103ee00c2a80f5d58b8; cookie2=1729efb908bb47c9173e7285b76bcf1f; _tb_token_=37535f9e637ee; _samesite_flag_=true; enc=0MctOlChFlCV5nZPM9I%2BvVj4iB8%2BJrpziCIjwoMf9H8E9LNIgKTgxu8%2BCSUFi9bSBiSB%2FDa5a9LfUco5h2f5%2Bg%3D%3D; JSESSIONID=24C312D5C90EBF0B9899868DCFB187FD; hng=CN%7Czh-CN%7CCNY%7C156; mt=ci=0_0; alitrackid=www.taobao.com; lastalitrackid=www.taobao.com; _uab_collina=158054440702373535211543; isg=BMXFMTj6N3FEpBO8M49bhYzg1wH_gnkUQBjla8cqiPwLXuXQj9O75js8aAKoBZHM; l=cBLFG56PQxE7VxgtBOCw5uI8Lz7tjIOYYuPRwCDMi_5Qw6Ls55bOo4N2tFp6csWdOPTB4eJvIYp9-etkiKy06Pt-g3fP.; cna=7ZWXFhLSeHECAd3As7NcDDt3; v=0; miid=664408812090938164; tk_trace=oTRxOWSBNwn9dPyorMJE%2FoPdY8zfvmw%2Fq5hp3RebXyJYzZUKAW1r6Uz%2FmMSvZbsMK3FmmZCXmyWRkMO1h3bnCN3x3ZrRJ7yYJdsRNoxMHGQpCyrQg3UzMo56H4tZmIQynT2CEisdNFijmGtI6hnUSL2LXCi4gKgBkS8Rgapd4MwIAFja5Js0G49JqnDhbWc1sKw79UmzqcVt7Tw0az0KP0yXz%2B0Li96D%2FooNJq9RYuD5ymYWzhG%2FmPLfTyzOOMEzkaw7usLWvRNcuQeUOw8Aji5KPbwH; x5sec=7b227365617263686170703b32223a223931346262326434363731346639336664383162376333356535323062363562434b377431664546454d4c306c666d38785065696f51456144544d304d5445784d4459344d546b374d54413d227d',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
        'Connection': 'keep-alive',
        'Host': 's.taobao.com',
        'Referer': 'https://s.taobao.com/search/_____tmd_____/verify/?nc_token=95d15a040668aef6de1945952b6d3d6f&nc_session_id=01Zi4sj2c1tpzARQSsI0rq4Cgb5Sg381G1L90GIrfuH6mT-4CLxi8J5_8kfLz3jmo6N8HPWMx6QlMWV-Nw9LKhFln6-0AXmw2z9nEi93BnEERLHjVyYXuOUK6PXRbErcqBWuWG0mV-a-DvE0pTXq6UJzeagecKV3_fbyDJFmMMda9Gn-gwOELXRwI79faMGTxFHvJG0DpdSHZOdyTDPcKc6A&nc_sig=05GpvesdJBVOhMKscmy0iTGVDhDcartZBI8_OhtG395T3MbvF8jsEiyBZ7_89OfHOu1nVfPaE5fquc_8txuX6MuF8JQQQg6KQjDn6GhIUSV4fA9h6a7L0JgEFy3pW-zi6nL8tX3gUMBIC4YgQfgM20A7aweY_FNSUFEk-nEfY_Qwzkpdmguqt_-uH0uG2xqNmbhzIiGJzKxLpS1LnjuS6F8jnEPs_3D0-lJiwaplLuVDZq1_zEMGxVwW_8z-7wUDFCWI9DTT7KPOs-J3qa5Yebuw4i27GuhH6ngF7lLWFCfZj6_1L4lvqAx25LM2Tin_MGdXxH5yNP9qr_zDPI2kztLEB0VESqjnwGcZ4Ovek3EkEo2o3GOxbowBqfBsLfhdzrp7i1nHL_dhh2gabB_Lino1sAxQHojeOF39MSL4bxBD5Ffwge4v4hLjRYG86tawsBHxVCueuVDLHjdNsABRbmWC-ezyLl12sTvnDX542Pzls&x5secdata=5e0c8e1365474455070961b803bd560607b52cabf5960afff39b64ce58073f78f68ede033dd239842063c29628191423866f9620b863c667132a90ce579d5cd75c7327bde2bcde85def97069291fa34dbdd96b66a9dff8da5f07d2fd13ea072445cb0ce36cef9f62cbd52852a03cf8ba461ee819ca12264cfd380e1ff9a3181721bc44fcd5aa925118d721cd93646f3e566899a389acd33f04add41433f5dd657f7806228d1f17d85334904897a5fa2e79a52a883b8d79c21b1904b01749f64ff68ede033dd239842063c29628191423f26a33fc19185ca7f5ba435d1801cd576b8357c40b8852e10bee2dd322fdfa01b85d13ca384528f05b373d3a77a70575ad921bb1d36afdc5973c0455682491a957f7918a4f2572499cc398910575bb4ae5b2a48d9c0185c8d8521d59b4860b9243a2952e026506275152d2dce642e18a4440bf0b3e57db00024c36b841c1cc35ed81c65bebf3b9df46dd6afed6f199892c38573d94a1e033206e485398b2371f6578a595e91f44da415c8660f5f2584e2dcd04435273e80c8ba41a4b44ca79f946d7c07b418bd61930fcc7f43085f215602ff14c1eeffa993259bf8351d819eb3f4129c5e95a897ae925e3fabb3e1a8cfc76271052aec7cfa7d67310728cc6e8&x5step=100&nc_app_key=X82Y__2ec484ab2f20befbd6f0aadd26b8bc5b',
        'TE': 'Trailers',
        'Upgrade-Insecure-Requests': '1',
    }
    try:
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # guess the real charset from the body
        return r.text
    except requests.RequestException:
        return ""  # best effort: callers treat "" as an empty page


# Step 2: parse one result page.
def parsePage(ilt, html):
    """Append [price, title] pairs extracted from *html* to the list *ilt*.

    Values are pulled from the page's embedded JSON with regex capture
    groups.  This replaces the original eval() on scraped text, which could
    have executed attacker-controlled code from the page.
    """
    plt = re.findall(r'"view_price":"([\d.]*)"', html)  # prices, e.g. "198.00"
    tlt = re.findall(r'"raw_title":"(.*?)"', html)      # product titles
    # zip() stops at the shorter list, so a malformed page cannot raise IndexError
    for price, title in zip(plt, tlt):
        ilt.append([price, title])


# Step 3: print the collected goods.
def printGoodsList(ilt):
    """Pretty-print *ilt* ([price, title] pairs) as a numbered table."""
    # chr(12288) is the full-width (CJK) space, used as the fill character so
    # Chinese titles align in fixed-width columns.
    tplt = "{0:^4}\t{1:^8}\t{2:{3}^25}"
    print(tplt.format("序號", "價格", "商品名稱", chr(12288)))
    print("-" * 90)
    for count, (price, title) in enumerate(ilt, start=1):
        print(tplt.format(count, price, title, chr(12288)))
    print("-" * 90)


def main():
    """Crawl `depth` result pages for the keyword and print a price table."""
    goods = '書包'  # search keyword
    depth = 3       # number of result pages to crawl
    start_url = 'https://s.taobao.com/search?q=' + goods
    infoList = []
    for i in range(depth):
        try:
            url = start_url + '&s=' + str(44 * i)  # 44 items per page; s = item offset
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            continue  # skip a broken page, keep crawling the next one
    printGoodsList(infoList)


# Program entry point
if __name__ == "__main__":
    main()
截圖:
實例二:
1,功能:
2,實現難點:
3,准備工作:網站選取原則:
爬取鏈接:https://quote.stockstar.com/stock/sha_3_1_x.html 股票綜合排名(x從1到53頁)
http://q.stock.sohu.com/cn/xxxxxx/index.shtml 搜狐個股股票信息
先爬取股票綜合排名:獲取股票代碼,放入搜狐股票的鏈接中轉到個股信息
爬取股票綜合排名網頁發現:股票簡略信息都在<tbody class="tbody_right" id="datalist"></tbody>中;一個行內是一支股票的信息;一行的第一列是代碼,第二列 是股票名稱
4,步驟:
import re
import traceback


def getHTMLText(url):
    """Download *url* and return its decoded text; return '無法連接' on failure."""
    import requests  # third-party; local import keeps the module importable without it

    access = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:72.0) Gecko/20100101 Firefox/72.0"}
    try:
        r = requests.get(url, headers=access)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return "無法連接"


def getStockList(lst, stockURL):
    """Walk all 53 ranking pages and append every stock code to *lst*."""
    import bs4
    from bs4 import BeautifulSoup

    page = 54  # the ranking list has 53 pages (1..53)
    try:
        for i in range(1, page):
            list_url = stockURL + str(i) + ".html"  # URL of page i
            print(list_url)  # progress/debug output
            html = getHTMLText(list_url)
            soup = BeautifulSoup(html, "html.parser")
            # <tbody id="datalist"> holds one <tr> per stock
            tbody = soup.find(id="datalist")
            for tr in tbody.children:
                if isinstance(tr, bs4.element.Tag):  # skip whitespace text nodes
                    td = tr('td')                    # the row's <td> cells
                    code = td[0].string              # column 1: stock code
                    stockName = td[1].string         # column 2: stock name (collected but unused)
                    lst.append(code)
    except Exception:
        print("獲取錯誤")


def getStockInfo(lst, stockURL, fpath):
    """Fetch each stock's detail page, parse its key figures, and append one
    dict per stock (as a text line) to *fpath*, printing progress as it goes.
    """
    import bs4
    from bs4 import BeautifulSoup

    cond = 0  # number of stocks processed so far (progress counter)
    for code in lst:
        url = stockURL + code + "/gp"
        try:
            print(url)  # progress/debug output
            html = getHTMLText(url)
            infoDict = {}
            soup = BeautifulSoup(html, "html.parser")
            div2 = soup.find(name="div", attrs={"class": "content clear"})  # fundamentals block
            div1 = soup.find(name="div", attrs={"class": "title_bg"})       # name + code block
            messaget = div1.find_all(name="h1")  # [name, code]
            message = div2.find_all(name="span")  # alternating label/value spans
            stock_name = messaget[0].string
            stock_code = messaget[1].string
            span1 = message[::2]   # even positions: labels / headline figures
            span2 = message[1::2]  # odd positions: values
            stock_price = span1[0].string
            stock_change_point = span1[1].string
            infoDict.update({"股票名稱": stock_name, "股票代碼": stock_code,
                             "今日股票價格": stock_price, "今日變化點數": stock_change_point})
            key = []  # Chinese field labels
            val = []  # corresponding values
            rE = r"[\[a-z]{0,2}\d{0,4}]"  # strips footnote markers like "[a1]" from labels
            for i in span1[2:15]:
                agei = re.sub(rE, "", i.string)
                agei = "".join(agei.split())  # drop internal whitespace
                key.append(agei)
            for j in span2[1:15]:
                val.append(j.string)
            key.insert(0, "漲跌幅")
            print(key[1])  # debug output
            for i in range(len(key)):
                infoDict[key[i]] = val[i]
            with open(fpath, 'a', encoding="utf-8") as f:
                f.write(str(infoDict) + "\n")
            cond += 1
            # BUG FIX: the original printed the undefined name `cont`, raising
            # NameError on every stock (also inside the except handler below).
            print("\r當前速度:{:.2f}%".format(cond * 100 / len(lst)), end="")
        except Exception:
            cond += 1
            print("\r當前速度:{:.2f}%".format(cond * 100 / len(lst)), end="")
            print("執行錯誤")
            traceback.print_exc()


def main():
    """Entry point: build the stock-code list, then crawl per-stock details."""
    stock_list_url = "https://quote.stockstar.com/stock/sha_3_1_"
    lst = []
    stock_info_url = "http://gu.qq.com/"
    out_put_file = "D://stock.txt"
    getStockList(lst, stock_list_url)
    getStockInfo(lst, stock_info_url, out_put_file)


# Program entry point — guarded so importing this module does not start a crawl
# (the original called main() unconditionally at import time).
if __name__ == "__main__":
    main()
截圖:
5,不足:爬取股票列表和個股網頁部分沒有爬取到,爬取速度慢。
只能爬取HTML中的靜態數據,不能爬取動態數據