關於爬蟲-爬取下載頁面下所有的文件夾以及子文件夾里的文件


我寫這個只是針對這個資源網站,for循環嵌套效率感覺很低,菜鳥一枚,如果有大佬有好的建議以及改進方法,希望可以提出,我一定會吸取采納。感謝!

  from urllib import request
  import re  # regular expressions (imported but not used in the visible code)
  from lxml import etree
  import requests
  requests.adapters.DEFAULT_RETRIES = 5  # raise the connection retry count
  s = requests.session()
  s.keep_alive = False  # close connections after each request instead of pooling
  import os
  
  # Root of the GLASS data server; relative links from the pages are joined onto this.
  basepath = "http://www.glass.umd.edu/"
  # Entry page listing the top-level product download folders.
  path="http://www.glass.umd.edu/Download.html"

  def getResponse(url):
      """Open *url* with urllib and return the response object, retrying on error.

      Parameters
      ----------
      url : str
          Absolute URL to request.

      Returns
      -------
      http.client.HTTPResponse
          The open response (caller reads and decodes it).

      Raises
      ------
      Exception
          Re-raises the last network error after all retries are exhausted.
          (The original retried forever with a bare ``except:``, which also
          swallowed Ctrl-C; a bounded retry with ``except Exception`` fixes both.)
      """
      # Build the headers once instead of on every retry.
      headers = {
          'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) '
                        'Chrome/83.0.4103.97 Safari/537.36',
      }
      max_tries = 20
      for attempt in range(max_tries):
          try:
              url_request = request.Request(url, headers=headers)
              return request.urlopen(url_request, timeout=30)
          except Exception:
              print("url_response出錯了")
              if attempt == max_tries - 1:
                  raise  # give up after max_tries instead of looping forever
  
  def get_file(data):
      """Walk the GLASS download tree (up to three levels) and download data files.

      Parameters
      ----------
      data : str
          HTML of the top-level Download.html page.

      Side effects: calls downLoad() for every matching file and prints progress.

      BUG FIX: the original passed the *directory* URL (``url``/``url1``/``url2``)
      to downLoad(), so the saved files contained the HTML directory listing
      instead of the actual data file. The three call sites now receive the
      full file URL. The walk itself (xpath queries, ``del lst[0]`` to skip
      the "parent directory" link) is kept exactly as before.
      """
      def _join(base, leaf):
          # Join a directory URL and a link without doubling the slash.
          return base + leaf if base.endswith("/") else base + "/" + leaf

      html = etree.HTML(data)
      urllist = html.xpath("//div[@class='demo']/a/@href")
      for i in urllist:
          url = basepath + i
          data1 = getData(url)
          html1 = etree.HTML(data1)
          urllist1 = html1.xpath("//td/a/@href")
          del urllist1[0]  # first link is the "parent directory" entry
          for j in urllist1:
              savepathname = i + "/" + j
              print("savepathname is %s" % savepathname)
              if ".tif" in j or ".txt" in j:
                  # j is a data file at this level: hand downLoad its real URL.
                  downLoad(_join(url, j), j, savepathname)
              else:
                  # j is a sub-directory: descend one level.
                  url1 = _join(url, j)
                  data2 = getData(url1)
                  html2 = etree.HTML(data2)
                  urllist2 = html2.xpath("//td/a/@href")
                  del urllist2[0]  # skip "parent directory"
                  for z in urllist2:
                      savepathname1 = savepathname + z
                      print("savepathname1 is %s" % savepathname1)
                      if ".dat" in z or ".hdr" in z or ".hdf" in z or ".jpg" in z or ".xml" in z:
                          downLoad(_join(url1, z), z, savepathname1)
                      else:
                          # NOTE(review): descending concat kept as in the
                          # original — assumes directory links end with "/".
                          url2 = url1 + z
                          data3 = getData(url2)
                          html3 = etree.HTML(data3)
                          urllist3 = html3.xpath("//td/a/@href")
                          del urllist3[0]  # skip "parent directory"
                          for k in urllist3:
                              print("k is %s" % k)
                              if ".hdf" in k or ".jpg" in k or ".xml" in k:
                                  downLoad(_join(url2, k), k, savepathname1)
                              else:
                                  print("找不到了,繼續往下尋找。")
  
  def downLoad(jpgUrl, name, savepathname):
      """Download *jpgUrl* and save it as E://url_D/<savepathname>/<name>.

      Parameters
      ----------
      jpgUrl : str
          Absolute URL of the file to fetch.
      name : str
          File name to save under.
      savepathname : str
          Directory (relative to E://url_D/) to save into.

      Retries the download up to 20 times on error; prints the outcome.

      BUG FIXES vs. the original:
      * the success path had no break/return, so every file was downloaded
        maxTryNum (20) times;
      * the file was opened at ``savepathname + name`` with no separator,
        writing a mangled name next to the directory makedirs() just created.
      """
      maxTryNum = 20
      target_dir = "E://url_D/" + savepathname
      # exist_ok avoids a crash when the directory survives from a previous run.
      os.makedirs(target_dir, exist_ok=True)
      print(target_dir)
      target_file = os.path.join(target_dir, name)
      for tries in range(maxTryNum):
          try:
              print("開始保存")
              res = s.get(jpgUrl, timeout=30)
              with open(target_file, 'wb') as f:
                  f.write(res.content)
                  print("保存成功")
              return  # success: stop retrying
          except Exception as e:  # network/IO error: retry or report
              if tries < (maxTryNum - 1):
                  continue
              print("出現異常%s" % e)
  
        
  def getData(path):
      """Fetch the page at *path* and return its body decoded as UTF-8."""
      response = getResponse(path)  # HTTPResponse object from urllib
      body = response.read()
      return body.decode('utf-8')
  
  def main():
      """Entry point: fetch the download index page and crawl it."""
      get_file(getData(path))
  
  # Run the crawler only when executed as a script, not on import.
  if __name__ == '__main__':
      main()
      print("爬取完畢")  # "crawl finished"


免責聲明!

本站轉載的文章為個人學習借鑒使用,本站對版權不負任何法律責任。如果侵犯了您的隱私權益,請聯系本站郵箱yoyou2525@163.com刪除。



 
粵ICP備18138465號   © 2018-2025 CODEPRJ.COM