將python爬蟲部署到docker環境中:
一、本地環境準備
- main.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File    : main.py
@Time    : 2022/02/10 10:21:43
@Author  : Shydow
@Version : 1.0
@Desc    : Hourly scheduler that launches two demo tasks in separate processes.
"""

# here put the import lib
import datetime
import logging
import time
from multiprocessing import Process

import requests
import schedule

from service import launcher

logging.basicConfig(format='%(asctime)s - %(pathname)s[line:%(lineno)d] - %(levelname)s: %(message)s', level=logging.INFO)


def _run_task(label, exec_date):
    """Print start/end markers, the execution date and a millisecond timestamp.

    Args:
        label: Word inserted into the start/end messages ("one" or "two").
        exec_date: Value printed right after the start message.
    """
    print(f"task {label} start .. ")
    print(exec_date)
    # Current wall-clock time in milliseconds since the epoch.
    timestamp = int(time.time() * 1000)
    print(timestamp)
    # NOTE(review): the 10 s sleep looks like a placeholder for real work.
    time.sleep(10)
    print(f"task {label} end .. ")


def run_one(exec_date):
    """Run demo task one for the given execution date."""
    _run_task("one", exec_date)


def run_two(exec_date):
    """Run demo task two for the given execution date."""
    _run_task("two", exec_date)


def daily_run():
    """Start ``run_one`` and ``run_two`` in two child processes.

    Both tasks receive today's date as an ISO-8601 string (``YYYY-MM-DD``).
    The processes are started but not joined, so this function returns
    immediately and the scheduler loop is not blocked while the tasks run.
    """
    # Only the ``datetime`` module is imported, so the class has to be
    # referenced as ``datetime.date``; a bare ``date`` raises NameError.
    exec_date = datetime.date.today().isoformat()
    for task in (run_one, run_two):
        Process(target=task, args=(exec_date,)).start()


def main():
    """Register the hourly job and poll the scheduler forever."""
    # Run the job at minute 5 of every hour.
    schedule.every().hour.at(":05").do(daily_run)
    while True:
        schedule.run_pending()
        time.sleep(1)


if __name__ == '__main__':
    main()
- service.py
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
"""
@File    : service.py
@Time    : 2022/02/10 10:23:33
@Author  : Shydow
@Version : 1.0
@Desc    : Minimal service module exposing ``launcher``.
"""

# here put the import lib


def launcher(current_time):
    """Print ``current_time`` to standard output.

    Args:
        current_time: Any printable value; it is passed to ``print`` unchanged.

    Returns:
        None.
    """
    print(current_time)
- requirements.txt
urllib3
DingDingBot
requests
PySocks==1.7.1
clickhouse-driver==0.2.0
pandas==0.25.1
numpy==1.16.5
schedule==1.1.0
hdfs==2.6.0
二、Dockerfile
# author: Shydow
# date  : 2022-02-10
# desc  : spider test dockerfile
FROM python:3.7.3

# Copy everything in the build context into /spider_deploy inside the image.
# COPY is used instead of ADD because only a plain local copy is needed.
COPY ./ /spider_deploy

# Use /spider_deploy as the working directory inside the container.
WORKDIR /spider_deploy

# Install the Python dependencies from the Tsinghua PyPI mirror (without
# keeping pip's download cache in the layer), then download Node.js,
# unpack it into /opt and delete the archive.
RUN pip install --no-cache-dir -i https://pypi.tuna.tsinghua.edu.cn/simple -r requirements.txt && \
    wget https://nodejs.org/dist/v10.16.0/node-v10.16.0-linux-x64.tar.xz && \
    tar xf node-v10.16.0-linux-x64.tar.xz -C /opt/ && \
    rm -rf node-v10.16.0-linux-x64.tar.xz

# Put the Node.js binaries on PATH.
ENV PATH=$PATH:/opt/node-v10.16.0-linux-x64/bin

# Set the container's time zone to Asia/Shanghai.
RUN cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime && \
    echo 'Asia/Shanghai' >/etc/timezone

# Command executed when a container is started from this image.
CMD ["python", "main.py"]
三、構建鏡像啟動
# Change to the directory that holds all the scripts (main.py, service.py,
# requirements.txt and the Dockerfile must be placed here).
cd /app/spider/spider_deploy

# Build the image; note the trailing "." (the build context).
sudo docker build -t spider_test .

# Start a detached container named "spider" from the image.
# Each --add-host adds a hostname -> IP entry to the container's /etc/hosts.
sudo docker run -d --name spider \
    --add-host cdh01:172.23.255.11 \
    --add-host cdh02:172.23.255.12 \
    --add-host cdh03:172.23.255.13 \
    --add-host cdh04:172.23.255.14 \
    spider_test