BS4庫詳解


  1 from bs4 import BeautifulSoup
  2 
  3 
  4 
  5 
  6 html = """
  7 <html><head><title>This is a python demo page</title></head>
  8 <body>
  9 <p class="title"><a>The demo python introduces several python courses.</a></p>
 10 <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
 11 <a href="http://www.icourse163.org/course/BIT-268001" class="py1" id="link1"><b class="element">Basic Python</b></a> and <a href="http://www.icourse163.org/course/BIT-1001870001" class="py2" id="link2">Advanced Python</a>.</p>
 12 </body></html>
 13 """
 14 
 15 soup = BeautifulSoup(html,'lxml')
 16 #基本使用
 17 # print(soup.prettify())
 18 # print(soup.title.string)
 19 
 20 #標簽選擇器
 21 #選擇元素
 22 # print(soup.title)
 23 # print(type(soup.title))
 24 # print(soup.head)
 25 # print(soup.p)#返回第一個Tag
 26 #
 27 # #獲取名稱
 28 #
 29 # print(soup.title.name)
 30 #
 31 # #獲取屬性
 32 # print(soup.a.attrs['href'])
 33 # print(soup.a['href'])
 34 #
 35 # #獲取內容
 36 # print(soup.p.string)
 37 #
 38 # #嵌套選擇
 39 # print(soup.head.title.string)
 40 
 41 #子節點和子孫節點
 42 # print(soup.body.contents)#獲取子節點,返回列表類型
 43 # print(soup.body.children) #返回迭代器類型
 44 # for i,child in enumerate(soup.body.children):
 45 #     print(i,child)
 46 
 47 # print(soup.body.descendants) #子孫節點,返回迭代類型
 48 # for i,child in enumerate(soup.body.descendants):
 49 #     print(i,child)
 50 
 51 #父節點和祖先節點
 52 # print(soup.a.parent)
 53 #
 54 # print(list(enumerate(soup.a.parents)))
 55 
 56 #兄弟節點
 57 # print(list(enumerate(soup.a.next_siblings)))
 58 # print(list(enumerate(soup.a.previous_siblings)))
 59 
 60 #標准選擇器
 61 #find_all(name,attrs,recursive,text,**kwargs) 可根據標簽名、屬性、內容查找文檔
 62 #name
 63 # print(soup.find_all('p'))
 64 # print(type(soup.find_all('p')[0]))
 65 # for i in soup.find_all('p'):
 66 #     print(i.find_all('a')) #嵌套選擇
 67 
 68 # #attrs
 69 # print(soup.find_all(attrs={'href':"http://www.icourse163.org/course/BIT-268001"}))
 70 # print(soup.find_all(attrs={'id':'link1'}))
 71 #
 72 # print(soup.find_all(id='link1'))
 73 # print(soup.find_all(class_='py1'))
 74 #
 75 # #text查找內容
 76 # print(soup.find_all(text='This is a python demo page'))#用來做內容匹配
 77 #
 78 # #find(name,attrs,recursive,text,**kwargs)
 79 # #用法一樣,find只是返回單個元素,find_all返回所有元素
 80 # print(soup.find('p',attrs={'class':'course'}))
 81 # print(type(soup.find('p')))
 82 
 83 
 84 #CSS選擇器,返回列表
 85 #通過select()直接傳入CSS選擇器即可完成選擇
 86 #選擇class屬性就直接用'.'代替,例:class=‘course’--》.course;#代表id
 87 # print(soup.select('.course .py1'))
 88 # print(soup.select('p a'))#嵌套選擇
 89 # print(soup.select('#link1 .element'))
 90 # print(type(soup.select('p')[0]))
 91 #
 92 # #嵌套選擇
 93 # for p in soup.select('p'):
 94 #     print(p.select('a'))
 95 #
 96 
 97 #獲取屬性
 98 for p in soup.select('p'):
 99     print(p['class'])
100     print(p.attrs['class'])
101 
102 
103 #獲取內容
104 for p in soup.select('p'):
105     print(p.get_text())

 


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯繫本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM