from urllib.parse import urljoin
import urllib.request
from bs4 import BeautifulSoup
import os
import datetime
import re
import errno
def mkdir_p(path):  # recursively create nested directories
    try:
        os.makedirs(path)
    except OSError as exc:  # Python >2.5 (use "except OSError, exc:" on Python <2.5)
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
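# Note: on Python >= 3.2 the same behavior is available directly from the
# standard library, so mkdir_p could be replaced with:
#   os.makedirs(path, exist_ok=True)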
def get_link(page):  # collect the href of every <a> inside the table cells
    linkData = []
    for cell in page.find_all('td'):  # renamed from "page" to avoid shadowing the parameter
        links = cell.select("a")
        for each in links:
            # if str(each.get('href'))[:1] == '/':  # optional filter, left disabled
            data = each.get('href')
            if data is not None:  # anchors without an href would otherwise append None
                linkData.append(data)
    return linkData
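# A minimal usage sketch (the markup here is hypothetical, mimicking the
# <td><a href=...> structure of the country page):
#   soup = BeautifulSoup(
#       '<table><tr><td><a href="/tides/Dalian">Dalian</a></td></tr></table>',
#       'lxml')
#   get_link(soup)  # -> ['/tides/Dalian']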
def gain(url):  # fetch a page and extract its links
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page, 'lxml')  # parse the page with BeautifulSoup
    links = get_link(soup)  # collect the <a href=...> values
    return links
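# Note: urlopen accepts a timeout, and some sites reject Python's default
# user agent; both can be handled with the standard library, e.g. (sketch):
#   req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
#   page = urllib.request.urlopen(req, timeout=30).read()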
def main():
    url = 'https://www.tide-forecast.com/countries/China'
    Web_Link = gain(url)
    url_Tide = 'https://www.tide-forecast.com/tides/'
    folder = 'D:\\TideData\\China'  # target directory for the downloads
    mkdir_p(folder)  # create it once, before the loop
    for Link_Add in Web_Link:  # iterate over the hrefs directly
        Link_One = re.split("/", Link_Add)  # split on '/' into a list
        if len(Link_One) < 3:  # skip hrefs that do not match /tides/<location>
            continue
        Link_Address = Link_One[2] + '.js'  # third element is the location name
        connect = urljoin(url_Tide, Link_Address)  # assemble the full URL
        print(connect)
        file_path = os.path.join(folder, Link_Address)  # absolute path of the target file
        if os.path.isfile(file_path):  # check the file itself, not the folder
            print('File already exists')
        else:
            start = datetime.datetime.now().replace(microsecond=0)  # timer
            wp = urllib.request.urlopen(connect)  # open the data URL
            content = wp.read()
            fp = open(file_path, "wb")  # write into the target folder
            fp.write(content)  # write the data
            fp.close()  # close the file
            end = datetime.datetime.now().replace(microsecond=0)
            print("Elapsed: ", end='')
            print(end - start)
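# Worked example of the URL assembly in the loop above (assuming a
# hypothetical href '/tides/Dalian'):
#   re.split("/", '/tides/Dalian')  ->  ['', 'tides', 'Dalian']
#   urljoin('https://www.tide-forecast.com/tides/', 'Dalian.js')
#     ->  'https://www.tide-forecast.com/tides/Dalian.js'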
if __name__ == '__main__':
    main()