詞牌名收集

原網頁形式

數據收集

import requests
from bs4 import BeautifulSoup
from lxml import etree
import xlwt

# Script: collect the ci tune names (cipai) from xungushici.com and store them
# in the first column of a spreadsheet.

# Request headers: send a desktop-browser User-Agent with every request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

cipai = []
# The listing is paginated; pages 1 through 6 are fetched.
for page in range(1, 7):
    url = 'https://www.xungushici.com/cipais/p' + str(page)
    # A timeout keeps a stalled connection from hanging the script forever.
    r = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of parsing an error page.
    r.raise_for_status()
    content = r.content.decode('utf-8')
    soup = BeautifulSoup(content, 'html.parser')
    hed = soup.find('ul', class_='list-unstyled d-flex flex-row flex-wrap align-items-center w-100')
    if hed is None:
        # The expected <ul> is missing on this page; report it and move on
        # rather than crashing with an AttributeError on None.
        print('no tune-name list found on ' + url)
        continue
    items = hed.find_all('li', class_="m-1 badge badge-light")
    for it in items:
        # Only entries that wrap a link carry a tune name.
        if it.a is not None:
            cipai.append(it.a.text)

# Write the names to a sheet: header "title" in row 0, one name per row below.
# NOTE(review): xlwt writes the legacy .xls format even though the file name
# ends in .xlsx - confirm that downstream readers cope with this.
xl = xlwt.Workbook()
# Call the workbook's add_sheet method
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
sheet1.write(0, 0, "title")
for row, name in enumerate(cipai, start=1):
    sheet1.write(row, 0, name)
xl.save("cipai_name.xlsx")

存儲形式

詩人合稱

原數據網頁

數據收集

import requests
from bs4 import BeautifulSoup
from lxml import etree
import xlwt

# Script: collect the collective names of poet groups from xungushici.com,
# look up the members of each group, and store both in a spreadsheet.

# Request headers: send a desktop-browser User-Agent with every request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'}

hc = []    # group names, in page order
dic = {}   # group name -> comma-separated member names

url = 'https://www.xungushici.com/authors'
# A timeout keeps a stalled connection from hanging the script forever.
r = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
r.raise_for_status()
content = r.content.decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
orign_href = 'https://www.xungushici.com'
hecheng = soup.find('div', id='divHeCheng')
if hecheng is None:
    raise RuntimeError('div#divHeCheng not found on ' + url)
entries = hecheng.find_all('li', class_="m-1 badge badge-light")

# The first entry is skipped, exactly as the original loop did
# (presumably it is not a real group link - TODO confirm against the page).
for entry in entries[1:]:
    if entry.a is None:
        # An entry without a link has neither a name nor a detail page.
        continue
    href = orign_href + entry.a['href']
    group_name = entry.a.text
    hc.append(group_name)

    # Fetch the group's own page and read the author of every card on it.
    r2 = requests.get(href, headers=headers, timeout=10)
    r2.raise_for_status()
    content2 = r2.content.decode('utf-8')
    soup2 = BeautifulSoup(content2, 'html.parser')
    pomdiv = soup2.find('div', class_='col col-sm-12 col-lg-9')
    card = pomdiv.find_all('div', class_='card mt-3') if pomdiv is not None else []
    author_list = []
    for it in card:
        h4 = it.find('h4', class_='card-title')
        list_a = h4.find_all('a') if h4 is not None else []
        # The author is taken from the second link of the card title;
        # cards with fewer than two links are skipped instead of raising.
        if len(list_a) < 2:
            continue
        author_list.append(list_a[1].text)
    dic[group_name] = ",".join(author_list)

# Write the result: header row ("hc", "author"), then one group per row.
# NOTE(review): xlwt writes the legacy .xls format even though the file name
# ends in .xlsx - confirm that downstream readers cope with this.
xl = xlwt.Workbook()
# Call the workbook's add_sheet method
sheet1 = xl.add_sheet('sheet1', cell_overwrite_ok=True)
sheet1.write(0, 0, "hc")
sheet1.write(0, 1, 'author')
for row, name in enumerate(hc, start=1):
    sheet1.write(row, 0, name)
    sheet1.write(row, 1, dic[name])
xl.save("common_name.xlsx")

# Echo every group and its members to the console.
for it in hc:
    print(it + ": " + dic[it])

存儲形式

之後將讀取該表，並在詩人表中對應添加一列「合稱」屬性

詩詞形式

形式分類

按照每句詩的字數分為：五言（每句五字）、七言（每句七字）

按照詩的句數分類：每首四句為絕句，每首八句為律詩。絕句分為五言絕句和七言絕句；律詩分為五言律詩和七言律詩

數據處理

新學到一個表格追加使用技巧：

from xlrd import open_workbook
from xlutils.copy import copy
# Write the classification results back into the original Excel file
def write_to(data, file, column=8, header="formal"):
    """Add *data* as one column of the first sheet of *file* and save in place.

    The workbook is opened with ``open_workbook``, duplicated with ``copy``,
    modified, and saved back to the same path.

    Args:
        data: Sized iterable of cell values; the n-th value goes to row n + 1.
        file: Path of the workbook to update.
        column: Zero-based index of the column to write (default 8, the
            column this function always used before).
        header: Text written to row 0 of that column (default "formal").

    NOTE(review): xlrd >= 2.0 reads only legacy .xls files and xlwt/xlutils
    write only the .xls format - confirm the workbooks passed here are
    compatible.
    """
    print(len(data))
    source_book = open_workbook(file)
    writable_book = copy(source_book)
    sheet = writable_book.get_sheet(0)

    sheet.write(0, column, header)
    # enumerate() avoids positional indexing, so any iterable of values works.
    for row, value in enumerate(data, start=1):
        sheet.write(row, column, value)

    writable_book.save(file)

數據處理源碼

import xlwt
import pandas as pd


# Read the source data and fetch the poem contents
def read_excel(file):
    """Load the workbook at *file* with pandas and return its ``content`` column.

    Args:
        file: Path of the Excel file to read.

    Returns:
        The ``content`` column of the loaded table.

    Raises:
        KeyError: If the table has no column named ``content``.
    """
    data = pd.read_excel(file)
    # Explicit column lookup; attribute-style access can be shadowed by
    # DataFrame attributes of the same name.
    return data["content"]


# Determine the form of each poem
# Every punctuation mark that can end a verse line is mapped to "。" so that a
# single split yields the individual lines.
_LINE_BREAK_TABLE = str.maketrans({mark: "。" for mark in "，？！；,.?!;"})
# Characters per line -> metre label, and lines per poem -> form label.
_METRE_BY_LINE_LENGTH = {5: "五言", 7: "七言"}
_FORM_BY_LINE_COUNT = {4: "絕句", 8: "律詩"}


def _classify_poem(text):
    """Return the form label of one poem given as plain text."""
    # Drop every whitespace character (newlines, spaces, ...) first.
    compact = "".join(text.split())
    lines = [line for line in compact.translate(_LINE_BREAK_TABLE).split("。") if line]
    # The metre is judged from the opening couplet: both lines must have the
    # same length, and that length must be 5 or 7 characters.
    if len(lines) < 2 or len(lines[0]) != len(lines[1]):
        return "無"
    metre = _METRE_BY_LINE_LENGTH.get(len(lines[0]))
    if metre is None:
        return "無"
    # Four lines make a jueju and eight lines a lüshi; any other line count
    # keeps the bare metre label.
    return metre + _FORM_BY_LINE_COUNT.get(len(lines), "")


def formal(content):
    """Classify every poem in *content* by metre and form.

    The previous version counted "。"-terminated sentences and compared that
    count with 8 and 4, but each such sentence is a couplet of two lines (it
    was required to be 11 or 15 characters long). A regular eight-line poem
    was therefore labelled as a quatrain, and a four-line quatrain got no form
    label at all. Verse lines are now counted directly.

    Args:
        content: Iterable of poem texts; each item is converted with ``str``.

    Returns:
        A list with one label per poem: "五言絕句", "七言絕句", "五言律詩",
        "七言律詩", "五言", "七言", or "無" when the opening couplet is neither
        five nor seven characters per line.
    """
    formal_list = []
    for it in content:
        label = _classify_poem(str(it))
        print(label)
        formal_list.append(label)
    return formal_list


from xlrd import open_workbook
from xlutils.copy import copy


# Write the classification results back into the original Excel file
def write_to(data, file, column=8, header="formal"):
    """Add *data* as one column of the first sheet of *file* and save in place.

    The workbook is opened with ``open_workbook``, duplicated with ``copy``,
    modified, and saved back to the same path.

    Args:
        data: Sized iterable of cell values; the n-th value goes to row n + 1.
        file: Path of the workbook to update.
        column: Zero-based index of the column to write (default 8, the
            column this function always used before).
        header: Text written to row 0 of that column (default "formal").

    NOTE(review): xlrd >= 2.0 reads only legacy .xls files and xlwt/xlutils
    write only the .xls format - confirm the workbooks passed here are
    compatible.
    """
    print(len(data))
    source_book = open_workbook(file)
    writable_book = copy(source_book)
    sheet = writable_book.get_sheet(0)

    sheet.write(0, column, header)
    # enumerate() avoids positional indexing, so any iterable of values works.
    for row, value in enumerate(data, start=1):
        sheet.write(row, column, value)

    writable_book.save(file)


# Collect the Excel files found in a given folder
import os


def get_filename(path, filetype):
    """List the files below *path* whose extension equals *filetype*.

    The directory tree is walked recursively. Each hit is returned as a path
    relative to *path*: a file directly inside *path* is returned as its bare
    name (as before), while a file in a sub-directory keeps its sub-directory
    prefix, so joining the result with *path* always yields an existing file.
    Previously nested files were returned as bare names, which produced
    non-existent paths for callers.

    Args:
        path: Directory to search.
        filetype: Extension including the dot, e.g. '.xlsx'.

    Returns:
        A list of relative file paths; empty when nothing matches or *path*
        does not exist.
    """
    names = []
    for root, _dirs, files in os.walk(path):
        for filename in files:
            if os.path.splitext(filename)[1] == filetype:
                names.append(os.path.relpath(os.path.join(root, filename), path))
    return names


if __name__ == '__main__':
    # Locate the source workbooks in the data folder
    file = 'data/'
    # Named "workbooks" so the builtin ``list`` is no longer shadowed.
    workbooks = get_filename(file, '.xlsx')
    for it in workbooks:
        newfile = os.path.join(file, it)
        # Read the poem contents
        data = read_excel(newfile)
        # Derive the form of every poem from its text
        formal_data = formal(data)
        # Write the forms back into the source workbook
        write_to(formal_data, newfile)

結果展示

明天任務

1.篩選出曲牌名

2.飛花令爬取

3.找出詩句對應的“飛花令”

4.中文分詞，試圖將詩人個人經歷，逐個分段，梳理出這幾類關鍵信息：人物，時間，事件，地點。將文本抽取為規則化的數據格式

免責聲明！

本站轉載的文章為個人學習借鑒使用，本站對版權不負任何法律責任。如果侵犯了您的隱私權益，請聯系本站郵箱yoyou2525@163.com刪除。

猜您在找 python字符串以單詞形式反轉 NLTK 詞干提取、詞形還原 token:NLP之詞形還原【465】詞干提取與詞形還原我最在行詩詞精美詩詞37首柳永經典詩詞九首詞形變換和詞干提取工具（英文）漢語詩詞 LaTeX 排版樣式劍網三門派詩詞