Python處理郵件內容和提取郵件里的url地址

本文轉載自查看原文 2020-12-23 21:32 689 Python爬蟲吧

最近在搞一個郵箱驗證賬號注冊和登錄的模塊。總結一下。就當記載。文章中涉及到域名和郵箱等都經過處理。

需求是這樣子的：通過selenium注冊某個網站的賬號，注冊后需要點擊郵件內容里的鏈接激活；登錄的時候如果不是常用設備的話也需要認證，而兩種認證方式給出的鏈接形式是不一樣的。一種是直接把激活鏈接發給你，就是純網址，這個好處理，直接用str或者text就能從郵件內容里將原始的url地址提取出來了。但是登錄認證的話就是一個錨文本鏈接，這個如果還用str或者text方式提取的話，就會出現一個問題，就是提取出來的url里的"&"會變成"&amp;"，在瀏覽器地址欄中輸入的話，就會出錯。所以用str或者text方式提取出來的，還得還原成地址欄能用的網址，用標准庫的html模塊（html.unescape）就行了。所有問題搞定。然后將提取出來的鏈接加載到selenium里繼續愉快的xxoo就行了。

思路：1->登錄郵箱；

　　　2->獲取最新郵件(獲取最新郵件的理由是，激活和認證的同時，應該不會那么巧有其他郵件發過來，所以我覺得，這種足夠滿足需求了)；

　　　3->解析郵件內容，用imap和pop取回來的郵件我嫌太麻煩，用國人大神開發的zmail解析郵件內容只需幾個步驟。

　　　　　　首先獲取郵件的html內容

　　　　　　將html內容解析成str

　　　　　　把str內容單獨提取出來，在本地建一個html頁面

　　　　　　通過bs4解析這個本地html的內容

　　　　　　提取自己想要的內容，在這里是鏈接地址

提取激活鏈接的代碼：

 1 import zmail
 2 from bs4 import BeautifulSoup
 3 import lxml
 4 
 5 '''
 6     #一個模塊專門處理xxx新注冊用戶的郵箱驗證問題
 7     #思路：
 8         1，首先通過zmail登錄到郵箱
 9         2，因為注冊都是即時的，驗證郵件應該也是即時的
10         3，收取最新收到的郵件
11         4，獲取最新郵件的content_html內容
12         5，將content_html內容轉換成str格式方便處理
13         6，將str格式的html文檔使用bs4解析
14         7，通過bs4解析出來的內容是一個列表
15         8，讀取列表里的文本內容，這樣子也不會丟失格式
16     #坑：
17         1，郵箱里的鏈接內容如果通過直接讀取，是會更改的，比如說&就會變成&amp;等，所以要通過text讀取
18         
19         
20 '''
class GetTheVerifyLink:
    """Fetch the account-activation link from the newest mail in a mailbox.

    The steps are: build a zmail server object, fetch the latest mail, check
    that it is the expected verification mail, join its HTML content into a
    single string, parse that string with BeautifulSoup and read the text of
    the link element.

    Every step reports its outcome with ``print`` and returns ``None`` on
    failure. Later steps check for ``None``, so an early failure ends in a
    ``None`` result instead of an unrelated ``TypeError`` or
    ``UnboundLocalError`` further down the chain.
    """

    def __init__(self, emailAccount, pwd):
        """Store the mailbox account name and its password."""
        self.emailAccount = emailAccount
        self.pwd = pwd

    def login_to_server(self):
        """Create the zmail server object for the stored credentials.

        Returns:
            The object returned by ``zmail.server``, or ``None`` when that
            call raises.
        """
        try:
            connect = zmail.server(self.emailAccount, self.pwd)
        except Exception:
            # Previously a bare ``except`` fell through to ``return connect``
            # with ``connect`` unbound; report the failure and return None.
            print("登錄失敗！請檢查")
            return None
        print("登錄成功！")
        return connect

    @staticmethod
    def _is_verification_mail(mail):
        """Return True when sender, subject and text all match the expected mail."""
        sender = mail['From']
        subject = mail['Subject']
        # 'Content_text' is joined as an iterable of strings before searching.
        content_text_str = ''.join(mail['Content_text'])
        return (
            '@xxx' in sender
            and 'verify your e-mail address' in subject
            and 'https://www.xxx/xx/xxx' in content_text_str
        )

    def get_the_latest_mail(self):
        """Fetch the newest mail and return it only if it is the verification mail.

        Returns:
            The mail object, or ``None`` when login failed or the newest mail
            is not the expected one.
        """
        server = self.login_to_server()
        if server is None:
            return None
        mail = server.get_latest()
        if self._is_verification_mail(mail):
            print("這是我們需要的郵件！")
            return mail
        print("這不是我們需要的郵件，請登錄郵箱手動刪除並保持amazon認證郵件最新！")
        return None

    def get_the_mail_content_html(self):
        """Return the ``content_html`` field of the verification mail, or None."""
        mail = self.get_the_latest_mail()
        # Guard against a missing mail before indexing into it.
        content_html = mail['content_html'] if mail is not None else None
        if content_html:
            return content_html
        print("獲取content_html內容郵件失敗！")
        return None

    def tranfer_content_html_to_str(self):
        """Join the HTML content parts into one string.

        Returns:
            The joined HTML string, or ``None`` when there is no HTML content.
        """
        content_html = self.get_the_mail_content_html()
        content_html_to_str = ''.join(content_html) if content_html else ''
        if content_html_to_str:
            print("將郵件的content_html轉換成str成功！")
            return content_html_to_str
        print("將郵件的content_html內容轉換成str失敗！")
        return None

    def get_the_verify_link(self):
        """Parse the mail HTML and return the text of the activation link.

        Returns:
            The text of the last element matched by the selector, or ``None``
            when there is no HTML to parse or the selector matches nothing.
        """
        html_str = self.tranfer_content_html_to_str()
        if not html_str:
            return None
        soup = BeautifulSoup(html_str, 'lxml')
        verify_link_list = soup.select(r'body > p:nth-child(3) > a:nth-child(1)')
        if not verify_link_list:
            return None
        # The original loop kept overwriting its variable, i.e. it returned
        # the text of the last match; keep that choice.
        return verify_link_list[-1].text
77 
78 # emailAccount = 'xxx@xxx.com'
79 # pwd = 'xxxx!'
80 #
81 # gt = GetTheVerifyLink(emailAccount,pwd)
82 # # latest_mail = gt.get_the_latest_mail()
83 # # print(latest_mail)
84 # verify_link = gt.get_the_verify_link()
85 # print(verify_link)

提取登錄鏈接的代碼：

 1 import zmail
 2 from bs4 import BeautifulSoup
 3 import lxml
 4 import html
 5 
 6 class xxxSignInConfirm():
 7     def __init__(self, username, password):
 8         self.username = username
 9         self.password = password
10 
11     def login_to_server(self):
12         try:
13             server = zmail.server(self.username, self.password)
14             print("E-mail Login success!")
15         except:
16             print("E-mail login Failed!")
17         return server
18 
19     def get_the_latest_mail(self):
20         '''
21             #通過判定最新郵件的發件人是不是有xxx.com來判定是不是我們要的郵件
22         '''
23         mail = self.login_to_server().get_latest()
24         #開始判定
25         sender = mail['From'] #獲取發件人
26         subject = mail['Subject'] #獲取郵件主題
27         content_text = mail['Content_text'] #獲取郵件內容，獲取文本的郵件內容主要是用在接下來的if語句里的
28         content_text_str = ''.join(content_text) #將郵件內容轉換為str，不轉換也行，不轉換請看下面
29         # content_text_str = ''.join(content_text[0]) #獲取列表的一個元素，將它轉換成str，因為它只有一個元素
30         if 'security' in subject:
31             print("這是我們需要的郵件！")
32             return mail
33         else:
34             print("這不是我們需要的郵件，請登錄郵箱手動刪除並保持amazon認證郵件最新！")
35 
36     def get_the_mail_content_html(self):
37         mail = self.get_the_latest_mail()
38         content_html = mail['Content_html']
39         if content_html:
40             return content_html
41         else:
42             print("獲取content_html內容郵件失敗！")
43 
44     def transfer_content_html_to_str(self):
45         content_html_to_str = ''.join(self.get_the_mail_content_html())
46         if content_html_to_str:
47             print("將郵件的content_html轉換成str成功！")
48             return content_html_to_str
49         else:
50             print("將郵件的content_html內容轉換成str失敗！")
51 
52     def get_the_sign_in_verify_link(self):
53         soup = BeautifulSoup(self.transfer_content_html_to_str(), 'lxml') #利用bs4解析html的郵件內容
54         signInconfirmLinks = soup.select(r'body > div:nth-child(3) > div:nth-child(4) > table:nth-child(2) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(1) > table:nth-child(1) > tbody:nth-child(1) > tr:nth-child(5) > td:nth-child(1) > a:nth-child(1)')
55         signlinkelement = signInconfirmLinks[0]
56         signlinkstr = str(signlinkelement) #轉換成str
57         # print(signlinkstr)
58         afind = signlinkstr.find('"') #查找字符串中<a href="https//xxxxxxxx.com/"</a>的第一個引號
59         # print(afind)
60         bfind = signlinkstr[afind + 1:].find('"') #找出第二個引號的位置
61         # print(bfind)
62         signlinkamp = signlinkstr[afind + 1: afind + 1 + bfind] #在兩個引號之間就是我們需要的鏈接地址，切片切出來
63         # print(signlinkamp) #打印一下提取出來的地址，是我們想要的，只是&變成了&amp;
64         verify_link = html.unescape(signlinkamp) #使用html還原真實網址
65         return verify_link
66 
# Example driver: fetch and print the sign-in confirmation link.
username = 'xxxx@xxx.com'
pwd = 'xxxx'
# The class in this listing is named xxxSignInConfirm; the previous name,
# AmazonSignInConfirm, is not defined anywhere and raised NameError.
ac = xxxSignInConfirm(username, pwd)  # instantiate


verify_link = ac.get_the_sign_in_verify_link()
print(verify_link)  # print the extracted link

兩段代碼基本上是相同的，只是最后一個類方法的處理方式有點不同。文中有些合法性驗證其實是不需要的，我是為了方便調試和查看程序運行軌跡才加那么多的。記錄歸記錄，也希望有志同道合的人，給出更完美的解決方案。

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 java如何提取url里的域名 python 日志內容提取提取Word里的文本內容 C# url 地址處理（截取，參數等）正則表達式—從HTML里提取內容 Python處理URL編碼 python 如何處理url的中文【python】使用python發送文本內容郵件【Python系列】Python自動發郵件腳本-html郵件內容 url地址傳參中文亂碼處理