The idea is simple: the dataset files are scattered across a single URL/index page, and downloading each one by hand is slow, so Python can be used to automate the download.
Problem: many of these datasets are hosted overseas and are heavily affected by network fluctuations, so it would be best to add retry-on-failure logic. That is not implemented here (a rough sketch is given after the code below).
Reference link:
https://blog.csdn.net/sinat_36246371/article/details/62426444
The code is essentially all from that author (thanks!); I only changed it slightly on top, adding exception handling.
''' downloading dataset on one html page '''
import requests
from bs4 import BeautifulSoup

archive_url = 'your_target_url'  # the page URL that lists the dataset files


def get_target_links():
    r = requests.get(archive_url)
    soup = BeautifulSoup(r.content, 'html5lib')
    links = soup.findAll('a')
    video_links = []
    # one-liner alternative:
    # video_links = [archive_url + link['href'] for link in links
    #                if (link['href'].endswith('atr') or link['href'].endswith('dat')
    #                    or link['href'].endswith('hea'))]
    for link in links:
        try:
            if (link['href'].endswith('atr') or link['href'].endswith('dat')
                    or link['href'].endswith('hea')):
                video_links.append(archive_url + link['href'])
        except KeyError:
            # some <a> tags have no href attribute; skip them
            print('keyerror, keep going!')
    for i in video_links:
        print(i, '\n')
    return video_links


def download_target_series(video_links):
    failed_list = []
    for link in video_links:
        file_name = link.split('/')[-1]
        file_name = 'your_local_folder' + file_name
        print("Downloading file:%s" % file_name)
        print(link)
        try:
            r = requests.get(link, stream=True)
        except Exception:
            failed_list.append(file_name.split('\\')[-1])
            print('download failed. go to download the next one\n')
            continue  # skip to the next link if the request itself failed
        # download started
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
        print("%s downloaded!\n" % file_name)
    print("All files downloaded!")
    print(failed_list)  # record which ones failed to download
    return


if __name__ == "__main__":
    target_links = get_target_links()
    download_target_series(target_links)
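
As for the retry-on-failure logic mentioned above, here is a minimal sketch of how it could be added. The function name fetch_with_retry and the max_retries / delay parameters are my own illustrative choices, not part of the original code:

import time
import requests

def fetch_with_retry(url, max_retries=3, delay=5):
    """Try requests.get up to max_retries times, waiting `delay` seconds
    between attempts; return the response, or None if every attempt fails."""
    for attempt in range(1, max_retries + 1):
        try:
            r = requests.get(url, stream=True, timeout=30)
            r.raise_for_status()
            return r
        except requests.RequestException as e:
            print('attempt %d/%d failed for %s: %s' % (attempt, max_retries, url, e))
            time.sleep(delay)
    return None

In download_target_series, the call r = requests.get(link, stream=True) could then be replaced with r = fetch_with_retry(link), appending the link to failed_list only when None comes back.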