之前用 openpyxl 库新建 Excel 文件保存数据时写入不了,于是换用 xlsxwriter 来保存。
批量插入图片到Excel表中
import os
import re
from multiprocessing.dummy import Pool as ThreadPool

import requests
import xlsxwriter
from openpyxl import load_workbook
from openpyxl.drawing.image import Image


def spider(url):
    """Download the first product image found on a search-result page.

    The SKU is taken from the digits following ``q=`` in *url*. When the page
    contains a product image it is written to ``<cwd>/images/<sku>.jpg``;
    otherwise the SKU is appended to the module-level ``No_images`` list.

    Relies on the module-level names ``cwd`` and ``No_images`` that the
    ``__main__`` section defines.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36'}
    # Extract the SKU from the tail of the search URL.
    sku = re.findall(r'q=(\d+)', url, re.S)
    if not sku:
        # No SKU means there is no file name to save under and nothing to
        # report; bail out instead of failing later on sku[0].
        return
    # headers must be passed by keyword: the second positional argument of
    # requests.get is `params`, which would put the dict in the query string.
    html = requests.get(url, headers=headers, timeout=None)
    pic_url = re.findall('class="product-image">.*?<img src="(.*?)" height="', html.text, re.S)
    if pic_url:
        print('正在下载'+sku[0]+'图片,图片地址:' + pic_url[0])
        pic = requests.get(pic_url[0])
        img_file = os.path.join(cwd, 'images', sku[0] + '.jpg')
        with open(img_file, 'wb') as file:
            file.write(pic.content)
    else:
        print('没有找到' + sku[0]+'产品')
        No_images.append(sku[0])


# openpyxl could not write a newly created workbook here, so xlsxwriter is
# used to save this report instead.
def save_excel(sku):
    """Write the SKUs that had no image to ``<cwd>/No_images.xlsx``.

    *sku* is a sequence of SKU values; they are written to column A, one per
    row, below a ``No_images_sku`` header cell.
    """
    print(sku)
    # The context manager closes (and thereby writes) the workbook even if a
    # cell write raises.
    with xlsxwriter.Workbook(os.path.join(cwd, 'No_images.xlsx')) as wb1:
        ws1 = wb1.add_worksheet()
        ws1.write(0, 0, 'No_images_sku')
        for row, value in enumerate(sku, start=1):
            ws1.write(row, 0, value)
    print('保存没有图片的sku成功!')


def insert_images(sku):
    """Insert ``<cwd>/images/<sku>.jpg`` into column H of every matching row.

    *sku* is the SKU as a string. Each row of the first worksheet whose
    column-B value equals *sku* (compared as text, so numeric and text cells
    both match) gets the image anchored at ``H<row>`` and its row height
    adjusted. The workbook at the module-level ``path`` is saved once, after
    all matching rows have been processed. A missing image file is reported
    with ``print`` and the workbook is left untouched.
    """
    img_path = os.path.join(cwd, 'images', sku + '.jpg')
    print(img_path)
    wb = load_workbook(path)
    try:
        ws = wb.worksheets[0]
        ws.column_dimensions['H'].width = 11  # column width for the image column
        inserted = False
        for row in range(1, ws.max_row + 1):
            if str(ws.cell(row, 2).value) != sku:
                continue
            # A fresh Image per row: the anchor is stored on the image object,
            # so reusing one object would move every copy to the last row.
            img = Image(img_path)
            img.width = 80
            img.height = 96
            ws.row_dimensions[row].height = 88  # row height to fit the image
            ws.add_image(img, 'H' + str(row))
            inserted = True
        if inserted:
            wb.save(path)
    except FileNotFoundError as e:
        print(e)
    finally:
        # The workbook is reopened for every SKU, so always release it.
        wb.close()


if __name__ == '__main__':
    cwd = os.getcwd()
    path = os.path.join(cwd, '图片测试.xlsx')
    # spider() writes into this directory; make sure it exists up front.
    os.makedirs(os.path.join(cwd, 'images'), exist_ok=True)

    # Read the SKU column (column B) once and release the workbook, since
    # insert_images() reopens and rewrites the same file later.
    wb = load_workbook(path)
    ws = wb.worksheets[0]
    rows = [(i, ws.cell(i, 2).value) for i in range(1, ws.max_row + 1)]
    wb.close()
    rows = [(i, sku) for i, sku in rows if sku is not None]

    pool = ThreadPool(50)  # 50 worker threads (multiprocessing.dummy is thread-based)
    urls = []
    No_images = []
    for i, sku in rows:
        print('正在爬取第'+str(i)+'个sku图片')
        url = 'http://www.fulchic.com/catalogsearch/result/?q=' + str(sku)
        urls.append(url)
    # Run spider over every search URL on the thread pool.
    pool.map(spider, urls)
    pool.close()
    pool.join()

    # Record the SKUs for which no image was found.
    save_excel(No_images)

    # Insert each downloaded image and size its cell.
    for i, sku in rows:
        print('正在保存第' + str(i) + '个sku图片')
        insert_images(str(sku))