'''
從web抓取數據:
webbrowser:是python自帶的,打開瀏覽器獲取指定頁面.
requests:從因特網上下載文件和網頁.
Beautiful Soup:解析HTML,即網頁編寫的格式.
selenium:啟動並控制一個Web瀏覽器.selenium能夠填寫表單,並模擬鼠標在這個瀏覽器中點擊
'''
import webbrowser
# Ask the standard-library webbrowser module to open this URL in a browser.
webbrowser.open('http://inventwithpython.com/')
'''
利用requests模塊從Web下載文件:
requests模塊讓你很容易從Web下載文件,不必擔心一些復雜的問題,
諸如網絡錯誤、連接問題和數據壓縮
'''
###################################用requests.get()下載一個網頁####################################
import requests

# Download the plain-text file; requests.get() returns a Response object.
res = requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt')
# NOTE: the bare expressions below only display a value when typed at the
# interactive prompt; in a script they are evaluated and discarded.
type(res)
# Compare with == rather than assign with =: an assignment would overwrite
# the real status code with 200 and hide a failed download.
res.status_code == requests.codes.ok
# The request succeeded when the Response's status_code attribute equals
# requests.codes.ok (the HTTP status code for "OK" is 200; 404 means
# "Not Found").
len(res.text)
# Show only the first 250 characters of the downloaded text.
print(res.text[:250])
###################################檢查下載錯誤####################################
import requests
# The URL path is 'page_that_does_not_exist', so this request is presumably
# expected to come back with an error status.
res=requests.get('http://inventwithpython.com/page_that_does_not_exist')
res.raise_for_status()  # does nothing if the download succeeded; raises an exception if it failed
##############################################
import requests

res = requests.get('http://inventwithpython.com/page_that_does_not_exist')
try:
    # raise_for_status() raises requests.exceptions.HTTPError when the
    # response carries a 4xx or 5xx status code.
    res.raise_for_status()
except requests.exceptions.HTTPError as exc:
    # Report the failure instead of letting the traceback end the script.
    print('There was a problem:%s'%(exc))
###################################將下載的文件保存到硬盤####################################
import requests

res = requests.get('http://www.gutenberg.org/cache/epub/1112/pg1112.txt')
res.raise_for_status()
# Binary mode ('wb') writes the downloaded bytes unchanged. The with block
# closes the file even if a write fails part-way through.
with open(r'C:\Users\Administrator\Desktop\RomeoAndJuliet.txt', 'wb') as playFile:
    # The argument is the number of bytes handed back per loop iteration.
    for chunk in res.iter_content(100000):
        # print() with parentheses works on both Python 2 and Python 3.
        print(len(chunk))
        playFile.write(chunk)
###################################學習HTML的資源####################################
'''
HTML初學者指南:
http://htmldog.com/guides/html/beginner/
http://www.codecademy.com/tracks/web/
https://developer.mozilla.org/en-US/learn/html
'''
######HTML快速復習
<strong>Hello</strong>world! #####<strong>表明:標簽包圍的文本將使用粗體
Al's free <a href="http://inventwithpython.com">Python books</a>
###################################查看網頁的HTML源代碼####################################
'''
在網頁任意位置點擊右鍵:選擇View Source或View page source
查看該頁的HTML文本
'''
'''
在Windows版的Chrome和IE中,開發者工具已經安裝了,可以按下F12,出現;
再次按下F12,可以讓開發者工具消失
'''
'''
不要用正則表達式來解析HTML:
嘗試用正則表達式來捕捉HTML格式的變化,非常繁瑣,容易出錯
專門用於解析HTML的模塊,諸如Beautiful Soup,將更不容易導致缺陷
http://stackoverflow.com/a/1732454/1893164/
'''
###################################使用開發者工具來尋找HTML元素####################################
'''
http://weather.gov/
郵政編碼為94105
通過開發者工具,找到對應代碼
'''
###################################從HTML創建一個BeautifulSoup對象####################################
import requests
import bs4

res = requests.get('http://forecast.weather.gov/MapClick.php?lat=37.78833550000007&lon=-122.39552170000002#.WXazEmP9c_0')
res.raise_for_status()
# Name the parser explicitly: without it Beautiful Soup picks whichever
# parser happens to be installed (and warns), so the resulting tree can
# differ from machine to machine.
noStarchSoup = bs4.BeautifulSoup(res.text, 'html.parser')
type(noStarchSoup)
# Save the raw page so the later examples can parse it from disk. The with
# block closes the file even if a write fails.
with open(r'C:\Users\Administrator\Desktop\rest.html', 'wb') as playFile:
    # The argument is the number of bytes handed back per loop iteration.
    for chunk in res.iter_content(100000):
        print(len(chunk))
        playFile.write(chunk)
###################################用select()方法尋找元素####################################
'''
傳遞給select()方法的選擇器 將匹配...
soup.select('div') 所有名為<div>的元素
soup.select('#author') 帶有id屬性為author的元素
soup.select('.notice') 所有使用CSS class屬性名為notice的元素
soup.select('div span') 所有在<div>元素之內的<span>元素
soup.select('div > span') 所有直接在<div>元素之內的<span>元素,中間沒有其他元素
soup.select('input[name]') 所有名為<input>,並有一個name屬性,其值無所謂的元素
soup.select('input[type="button"]') 所有名為<input>,並有一個type屬性,其值為button的元素
'''
<div id="current_conditions-summary" class="pull-left">
<p class="myforecast-current">NA</p>
<p class="myforecast-current-lrg">60°F</p>
<p class="myforecast-current-sm">16°C</p>
</div>
<div id="comic">
<img src="//imgs.xkcd.com/comics/barrel_cropped_(1).jpg" title="Don't we all." alt="Barrel - Part 1" />
</div>
import bs4

# The page was saved as raw bytes, so read it back as bytes and let
# Beautiful Soup detect the encoding instead of relying on the platform's
# default text encoding. The with block closes the file once it is read.
with open(r'C:\Users\Administrator\Desktop\rest.html', 'rb') as exampleFile:
    exampleSoup = bs4.BeautifulSoup(exampleFile.read(), 'html.parser')
# '#current_conditions-summary' is a CSS id selector; select() returns a
# list of every matching element.
elems = exampleSoup.select('#current_conditions-summary')
# NOTE: the bare expressions below only display a value when typed at the
# interactive prompt; in a script they are evaluated and discarded.
type(elems)
len(elems)
type(elems[0])
elems[0].getText()
>>> elems[0].getText()
u'\nNA\n59\xb0F\n15\xb0C\n'
>>> str(elems[0])
'<div class="pull-left" id="current_conditions-summary">\n<p class="myforecast-current">NA</p>\n<p class="myforecast-current-lrg">59\xc2\xb0F</p>\n<p class="myforecast-current-sm">15\xc2\xb0C</p>\n</div>'
>>> elems[0].attrs
{'id': 'current_conditions-summary', 'class': ['pull-left']}
#########################
# Collect every <p> element found in the parsed document.
pElems=exampleSoup.select('p')
>>> pElems[1]
<p>Your local forecast office is</p>
>>> pElems[2]
<p>
Severe thunderstorms will be possible over portions of the upper Midwest and Great Lakes Tuesday, Wednesday, and Thursday. Damaging winds, large hail, and heavy rainfall possible. Over the Desert Southwest and portions of the Rockies, Monsoonal moisture will lead to locally heavy rainfall and the threat for flash flooding into midweek.
<a href="http://www.wpc.ncep.noaa.gov/discussions/hpcdiscussions.php?disc=pmdspd" target="_blank">Read More ></a>
</p>
>>> pElems[1].getText()
u'Your local forecast office is'
###################################通過元素的屬性獲取數據####################################
import bs4

# Open in binary mode so Beautiful Soup handles the decoding, and use a
# with block so the file handle is closed once parsing is done.
with open(r'C:\Users\Administrator\Desktop\rest.html', 'rb') as htmlFile:
    soup = bs4.BeautifulSoup(htmlFile, 'html.parser')
# Take the first <span> in the document; this raises IndexError if the
# page contains no <span> at all.
spanElem = soup.select('span')[0]
>>> str(spanElem)
'<span class="sr-only">Toggle navigation</span>'
>>> spanElem.get('class')
['sr-only']
>>> spanElem.attrs
{'class': ['sr-only']}
>>> spanElem.get('id')==None
True
###################################用selenium模塊控制瀏覽器####################################
###################################啟動selenium控制的瀏覽器####################################
# Download Firefox: http://getfirefox.com/
from selenium import webdriver
# Create a Firefox WebDriver object; the browser is driven through it.
browser=webdriver.Firefox()
type(browser)
# Direct the controlled browser to this URL.
browser.get('http://inventwithpython.com')
###################################maplt.py####################################
import webbrowser
import sys
import pyperclip

try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote  # Python 2

if len(sys.argv) > 1:
    # Get address from command line: join every argument so a multi-word
    # address does not have to be quoted in the shell.
    address = ' '.join(sys.argv[1:])
else:
    # No arguments given: fall back to whatever is on the clipboard.
    address = pyperclip.paste()

# Trim stray whitespace/newlines picked up from the clipboard.
address = address.strip()
# quote() on Python 2 cannot handle non-ASCII unicode text, so hand it
# UTF-8 bytes; byte strings (Python 2 argv) are passed through unchanged.
if not isinstance(address, bytes):
    address = address.encode('utf-8')
# Percent-encode the address so characters such as '#', '?' and '&' stay
# part of the place name instead of being read as URL syntax.
webbrowser.open('https://www.google.com/maps/place/' + quote(address))
