python通過pop3方式登錄郵箱(qq,新浪,網易)


python內置模塊登錄郵箱(SMTP/POP3)

使用pdfminer解析PDF合同附件,並簡單地提取其中的內容

壓縮包已經上傳!

poplib_emain.py內容(完整版):

 

# coding:utf-8
# date:2018/4/19
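# PDFParser : parses the raw PDF file
# PDFDocument : holds the parsed document structure
# PDFResourceManager : stores shared resources (fonts, images)
# PDFPageInterpreter : processes page contents; a PDFDevice then translates them into the desired output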
from email.parser import Parser
from email.header import decode_header
from email.utils import parseaddr
import re
import poplib
import traceback
import mysql.connector
from pdfminer.pdfparser import PDFDocument, PDFParser
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from emails.settings import *
from emails.Logger import *

# Module-level logger used by `deco` (error reports) and `Email.save_file` (progress messages).
# NOTE(review): `Logger` is presumably provided by the `emails.Logger` star import — confirm
# what `logname`/`loglevel` mean there (looks like a log file name and a verbosity level).
logger = Logger(logname='log5.txt', loglevel=1, logger='email').getlog()


def deco(func):
    """Decorate a method so that any exception is logged instead of raised.

    The wrapped method is called normally; if it raises an ``Exception`` the
    full traceback is written to the module-level ``logger`` and ``None`` is
    returned to the caller, so callers must be prepared for a ``None`` result.

    :param func: method taking ``self`` as its first argument
    :return: wrapper with the same name and docstring as ``func``
    """
    # Imported locally so the decorator does not depend on a file-level import.
    from functools import wraps

    @wraps(func)  # keep func.__name__ / __doc__ visible on the decorated method
    def wrapper(self, *args, **kwargs):
        try:
            return func(self, *args, **kwargs)
        except Exception:
            logger.error('\n%s\n方法 %s發生錯誤,原因是: %s\n' % ('-' * 100, func.__name__, traceback.format_exc()))
            # Explicit: a failed call yields None rather than propagating.
            return None

    return wrapper


class Email:
    """Download a POP3 mailbox, parse every message and store it in MySQL.

    For each message the From/To/Subject/Date headers, the Chinese text of the
    body and, when a PDF attachment is present, three sections of the contract
    text extracted from that PDF are written to the ``email_sina`` table.
    """

    # Scratch lists holding the contract sections of the most recently parsed
    # PDF. They are rebound on the instance for every PDF so that text from one
    # e-mail never leaks into the next one.
    name_lis = []
    money_lis = []
    mingxi_lis = []

    # (address fragments, POP3 host) pairs, checked in order.
    POP3_SERVERS = (
        (('@163.',), 'pop3.163.com'),
        (('@126.',), 'pop3.126.com'),  # NetEase
        (('@qq.com',), 'pop.qq.com'),  # Tencent
        (('@sina.com', '@2008.sina', '@51uc.com'), 'pop.sina.com'),  # Sina
        (('@188.com',), 'pop3.188.com'),
    )

    def __init__(self):
        """Open the MySQL connection described by the settings constants."""
        self.db = mysql.connector.Connect(host=HOST, user=USER, password=PASSWORD, port=PORT, db=DBS)
        self.cursor = self.db.cursor()

    @deco
    def pdf_text(self, path):
        """Extract the text of every horizontal text box of a PDF.

        :param path: location of the PDF file
        :return: list of text fragments in page order; ``None`` when parsing
            fails, because ``deco`` logs and swallows the exception
        :raises PDFTextExtractionNotAllowed: if the document forbids text
            extraction (logged by ``deco``, not propagated)
        """
        content_ = []
        # The parser reads lazily from the file, so it must stay open until
        # all pages have been processed.
        with open(path, 'rb') as fp:
            parse = PDFParser(fp)
            document = PDFDocument(parse)
            # Connect the parser and the document to each other.
            parse.set_document(document)
            document.set_parser(parse)
            # No password is supplied.
            document.initialize()
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)

            for page in document.get_pages():
                interpreter.process_page(page)
                # The aggregator exposes the layout of the page just processed.
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        content_.append(x.get_text())
        return content_

    def turn_on_off(self, lines, status, start, end):
        """Update an on/off capture flag for one line of text.

        The flag switches on when the line starts with ``start`` and switches
        off when, while on, the line equals ``end`` exactly.

        :param lines: the current line (including its line ending)
        :param status: current state of the flag
        :param start: prefix that turns the flag on
        :param end: full line that turns the flag off
        :return: the new state of the flag
        """
        if lines.startswith(start):
            return True
        if status and lines == end:
            return False
        return status

    def savefile(self, filename, data, path):
        """Write ``data`` to ``path + filename``.

        A failure to open the target file is reported on stdout and otherwise
        ignored.

        :param filename: name of the file to create
        :param data: bytes to write
        :param path: directory prefix, concatenated as-is with ``filename``
        """
        filepath = path + filename
        print('Save as: ' + filepath)
        try:
            f = open(filepath, 'wb')
        except OSError:
            print(filepath + ' open failed')
            return
        with f:
            f.write(data)

    def decode_str(self, s):
        """Decode an RFC 2047 encoded header value to text.

        All encoded and unencoded chunks of the header are decoded and joined.
        Bytes that cannot be decoded with the declared charset are replaced
        rather than raising.

        :param s: raw header value
        :return: decoded string
        """
        pieces = []
        for value, charset in decode_header(s):
            if isinstance(value, bytes):
                try:
                    value = value.decode(charset or 'ascii', errors='replace')
                except LookupError:
                    # Unknown charset name in the header: fall back to UTF-8.
                    value = value.decode('utf-8', errors='replace')
            pieces.append(value)
        return ''.join(pieces)

    def guess_charset(self, msg):
        """Determine the charset of a message part.

        The charset set on the message object is used when there is one;
        otherwise the ``charset=`` parameter of the Content-Type header is
        parsed, without surrounding quotes or trailing parameters.

        :param msg: message (part) to inspect
        :return: the charset, or ``None`` if none could be found
        """
        charset = msg.get_charset()
        if charset is None:
            content_type = msg.get('Content-Type', '').lower()
            pos = content_type.find('charset=')
            if pos >= 0:
                charset = content_type[pos + 8:].split(';', 1)[0].strip().strip('"\'')
        return charset

    def _parse_headers(self, msg, headers):
        """Fill ``headers`` with the decoded From, To, Subject and Date of ``msg``."""
        for header in ['From', 'To', 'Subject', 'Date']:
            value = msg.get(header, '')
            if not value:
                continue
            if header == 'Date':
                headers['date'] = value
            elif header == 'Subject':
                headers['Subject'] = self.decode_str(value)
            else:
                # Address headers: decode the display name, keep the address.
                hdr, addr = parseaddr(value)
                name = self.decode_str(hdr)
                headers[header.lower()] = u'%s <%s>' % (name, addr)

    def _extract_contract_sections(self, text_path):
        """Scan the extracted PDF text and collect the three contract sections."""
        # Rebind per PDF so earlier e-mails do not accumulate in the result.
        self.name_lis = []
        self.money_lis = []
        self.mingxi_lis = []
        names = money = mingxi = False
        with open(text_path, 'r', encoding='utf-8') as text_file:
            for line in text_file:
                # A line is collected when the flag was already on before this
                # line, so the start marker is skipped and the end marker kept.
                if names:
                    self.name_lis.append(line.strip())
                names = self.turn_on_off(line, names, '甲方(借款人):', '鑒於:\n')
                if money:
                    self.money_lis.append(line.strip())
                money = self.turn_on_off(line, money, '第一條 借款金額、期限及利息、借款類型、借款提現', '第二條 還款\n')
                if mingxi:
                    self.mingxi_lis.append(line.strip())
                mingxi = self.turn_on_off(line, mingxi, '2.4 還款計划明細\n',
                                          '甲方還款日如發生變化的,甲方同意服務方以錄音電話或電子郵件並輔助以短信的方\n')

    def _handle_attachment(self, part, raw_name, mypath, headers):
        """Save a PDF attachment and record its contract sections in ``headers``."""
        filename = self.decode_str(raw_name)
        # The name comes from the e-mail, so drop any directory components
        # before using it as a path below ``mypath``.
        filename = re.split(r'[\\/]', filename)[-1]
        if '.pdf' not in filename.lower():
            return
        data = part.get_payload(decode=True)
        if data is None:
            # No decodable payload, nothing to save.
            return
        print('Accessory: ' + filename)
        headers['url'] = mypath + filename
        self.savefile(filename, data, mypath)
        # pdf_text returns None when parsing failed (see deco).
        texts = self.pdf_text(mypath + filename) or []
        with open('pdf.txt', 'wb') as f:
            f.write(''.join(texts).encode('utf-8'))
        self._extract_contract_sections('pdf.txt')
        headers['names'] = ''.join(self.name_lis)
        headers['money'] = ''.join(self.money_lis)
        headers['mingxi'] = ''.join(self.mingxi_lis)

    @deco
    def print_info(self, msg, mypath):
        """Parse one e-mail message.

        :param msg: parsed message object
        :param mypath: directory prefix where PDF attachments are saved
        :return: dict with the keys ``from``, ``to``, ``Subject``, ``date``,
            ``url``, ``names``, ``money``, ``mingxi`` and ``contents``; every
            key is always present and defaults to an empty string. ``None``
            when parsing fails, because ``deco`` swallows the exception.
        """
        headers = {
            'from': '', 'to': '', 'Subject': '', 'date': '',
            'url': '', 'names': '', 'money': '', 'mingxi': '', 'contents': '',
        }
        self._parse_headers(msg, headers)
        for part in msg.walk():
            filename = part.get_filename()
            if filename:
                self._handle_attachment(part, filename, mypath, headers)
                continue
            charset = self.guess_charset(part)
            if not charset:
                # Without a charset there is no text to extract from this part.
                continue
            try:
                content = part.get_payload(decode=True).decode(charset)
            except (AttributeError, LookupError, TypeError, ValueError):
                content = '英文'
            # Keep only the runs of Chinese characters of the body.
            headers['contents'] = ','.join(re.findall(u'[\u4e00-\u9fa5]+', content)).replace('宋體', '')
        return headers

    @staticmethod
    def _pop3_server_for(email):
        """Return the POP3 host for ``email``, or ``None`` if it is unsupported."""
        for fragments, host in Email.POP3_SERVERS:
            if any(fragment in email for fragment in fragments):
                return host
        return None

    def _ensure_table(self):
        """Create the ``email_sina`` table when it does not exist yet."""
        sql = "create table if not exists email_sina(username char(255) not null,froms char(255) not null,tos char(255) not null,subject char(255) not null primary key,dates char(255),fu_url char(255),contents char(255),names varchar(1000),money varchar(1000),detail varchar(1000))"
        try:
            self.cursor.execute(sql)
            self.db.commit()
            print('創建email_sina表')
        except mysql.connector.Error as e:
            # "if not exists" already covers an existing table, so reaching
            # this point means the statement itself failed.
            print('創建email_sina表失敗: %s' % e)

    def _store_message(self, server, index, email, mypath):
        """Download message ``index``, parse it and upsert it into the table."""
        _, lines, _ = server.retr(index)
        try:
            msg_content = b'\r\n'.join(lines).decode('utf-8')
        except UnicodeDecodeError:
            logger.info('第%s封郵件不是UTF-8編碼,已跳過' % index)
            return
        msg = Parser().parsestr(msg_content)
        dic = self.print_info(msg, mypath)
        if dic is None:
            # print_info failed and deco already logged the traceback.
            logger.info('第%s封郵件解析失敗,已跳過' % index)
            return
        # Values come from e-mail content, so they are passed as parameters
        # and never concatenated into the statement.
        sql = ("replace into email_sina(`username`,`froms`,`tos`,`subject`,`dates`,`fu_url`,`contents`,`names`,`money`,`detail`) "
               "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        params = (email, dic['from'], dic['to'], dic['Subject'], dic['date'], dic['url'],
                  dic['contents'], dic['names'], dic['money'], dic['mingxi'])
        try:
            self.cursor.execute(sql, params)
            self.db.commit()
            print(dic['Subject'], '插入')
        except mysql.connector.Error as e:
            print(e)
            self.db.rollback()

    @deco
    def save_file(self):
        """Prompt for credentials, fetch all messages and store them.

        The POP3 server is chosen from the e-mail address; an unsupported
        address ends the program. Both the POP3 and the database connection
        are closed when processing ends, whether or not it succeeded.
        """
        email = str(input('請輸入賬號:'))
        password = str(input('請輸入授權碼:'))
        pop3_server = self._pop3_server_for(email)
        if pop3_server is None:
            logger.info('不支持此郵箱%s,或程序出錯' % email)
            print('暫不支持此郵箱')
            raise SystemExit

        mypath = PATH

        print(pop3_server)
        logger.info('傳入的賬號是%s' % email)
        try:
            server = poplib.POP3_SSL(pop3_server)
            try:
                # NOTE(review): debug level 1 echoes each request line; that
                # likely includes the PASS command — consider disabling it
                # outside of debugging.
                server.set_debuglevel(1)
                print(server.getwelcome().decode('utf-8'))

                server.user(email)
                server.pass_(password)
                self._ensure_table()

                # list() returns one entry per message; numbering starts at 1.
                _, mails, _ = server.list()
                for i in range(1, len(mails) + 1):
                    self._store_message(server, i, email, mypath)
                logger.info('賬號解析完成')
            finally:
                server.close()
        finally:
            self.db.close()


# Script entry point: build the mailbox client and run one full import.
if __name__ == '__main__':
    mailbox = Email()
    mailbox.save_file()

 

 
        


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM