使用環境
- Python3.6
- Django1.11.11
- Xadmin 后台管理插件
- MySQL數據庫
文件目錄結構
文件目錄結構
數據表結構 models.py
from django.db import models
from django.contrib.auth.models import AbstractUser
import time
"""
學習到的知識:
1) 一個表需要多次關聯同一個表時,需要用 related_name 重命名反向關聯名稱(必要時再用 related_query_name 指定反向查詢名字)
2) 索引的使用:db_index=True
"""
# Create your models here.
# 用戶表
class UserInfo(AbstractUser):
    """Site user: ``AbstractUser`` extended with a few personal-detail columns."""

    gender = models.CharField('性別', max_length=32)  # gender
    phone = models.CharField('手機號', max_length=32)  # mobile phone number
    name = models.CharField('姓名', max_length=32)  # real name
    ID_number = models.CharField('身份證號', max_length=32)  # identity-card number

    class Meta:
        verbose_name_plural = '用戶表'

    def __str__(self):
        # Rendered as "<pk>----<username>".
        return '{}----{}'.format(self.pk, self.username)
# 車站表
class Station(models.Model):
    """A railway station that belongs to one ``City``."""

    # Declared as a plain IntegerField primary key, so callers must supply the id.
    id = models.IntegerField(primary_key=True)
    station_name = models.CharField('車站名稱', max_length=32)  # station name
    english = models.CharField('英文編碼', max_length=32, db_index=True)  # English code (indexed)
    spell = models.CharField('拼音', max_length=32)  # pinyin spelling
    spell_brief = models.CharField('拼音簡', max_length=32)  # abbreviated pinyin
    # on_delete is spelled out: CASCADE is what Django 1.11 applies implicitly,
    # and the argument becomes mandatory in Django 2.0.
    city = models.ForeignKey(verbose_name='關聯城市', to='City', db_index=True,
                             on_delete=models.CASCADE)  # owning city

    def __str__(self):
        return str(self.station_name)

    class Meta:
        verbose_name_plural = '車站表'
# 城市
class City(models.Model):
    """A city; ``Station`` rows point at it through their ``city`` foreign key."""

    city_name = models.CharField('城市列表', max_length=32)  # city name

    class Meta:
        verbose_name_plural = '城市'

    def __str__(self):
        # The display form is simply the city name.
        return '{}'.format(self.city_name)
# 列車表
class Train(models.Model):
    """A train service with its origin, terminus and the stations it calls at."""

    train_size = models.CharField('列車號', max_length=32, db_index=True)  # train number (indexed)
    train_coding = models.CharField('列車編碼', max_length=32)  # train code
    # Two foreign keys target the same model, so each needs its own
    # related_name to keep the reverse accessors on Station from clashing.
    # on_delete is spelled out: CASCADE is the implicit Django 1.11 behaviour
    # and the argument is mandatory from Django 2.0.
    start_stand = models.ForeignKey(verbose_name='起始站', to='Station',
                                    related_name='related_start_stand',
                                    on_delete=models.CASCADE)  # origin station
    terminus = models.ForeignKey(verbose_name='終點站', to='Station',
                                 related_name='related_terminus',
                                 on_delete=models.CASCADE)  # terminal station
    depart = models.CharField('始發時間', max_length=32)  # departure time at origin
    arrive = models.CharField('到達時間', max_length=32)  # arrival time at terminus
    coach_num = models.CharField('車廂數', max_length=32, default=7)  # number of coaches
    # Many-to-many to Station through the Station2Train timetable rows.
    station = models.ManyToManyField(verbose_name='關聯列車進站時間表', to='Station',
                                     through='Station2Train',
                                     through_fields=('train', 'station'))

    def __str__(self):
        return str(self.train_size)

    class Meta:
        verbose_name_plural = '列車表'
# 列車進站時間表 車站——列車多對多表
class Station2Train(models.Model):
    """Timetable row linking one ``Train`` to one ``Station`` (the M2M through model)."""

    # on_delete is spelled out: CASCADE is the implicit Django 1.11 behaviour
    # and the argument is mandatory from Django 2.0.
    station = models.ForeignKey(verbose_name='關聯車站表', to='Station',
                                on_delete=models.CASCADE)  # station called at
    train = models.ForeignKey(verbose_name='關聯列車表', to='Train',
                              on_delete=models.CASCADE)  # train making the call
    station_next = models.CharField('站次(這趟車第幾次經過)', max_length=32)  # stop sequence number
    arrive_time = models.CharField('到達時間', max_length=32)  # arrival time
    depart_time = models.CharField('出發時間', max_length=32)  # departure time
    distance = models.CharField('和上一站的距離', max_length=32)  # distance from the previous stop
    is_state = models.CharField('是起終停', max_length=32)  # origin / terminus / intermediate marker

    def __str__(self):
        # NOTE(review): the leading 'station' is a literal word, not the station
        # name - confirm whether str(self.station) was intended.
        return 'station' + '的到達時間:' + str(self.arrive_time) + ' 出發時間' + str(self.depart_time)

    class Meta:
        # A given station may appear at most once per train.
        unique_together = ("station", "train")
        verbose_name_plural = '列車進站時間表'
# 車座表
class Seat(models.Model):
    """One seat of one coach of a train, together with its sale status."""

    # Seat classes stored in ``seat_type``.
    choices = (
        (1, '商務座'),
        (2, '一等座'),
        (3, '二等座'),
        (4, '高級軟卧'),
        (5, '高級硬卧'),
        (6, '硬座'),
        (7, '無座'),
    )
    # on_delete is spelled out: CASCADE is the implicit Django 1.11 behaviour
    # and the argument is mandatory from Django 2.0.
    train = models.ForeignKey(verbose_name='關聯列車表', to='Train', null=True, db_index=True,
                              on_delete=models.CASCADE)  # owning train
    coach_size = models.CharField('車廂號', max_length=32)  # coach number
    seat_type = models.IntegerField('座位類型', choices=choices)  # seat class (see ``choices``)
    seat_size = models.CharField('座位號', max_length=32)  # seat number
    is_sell = models.CharField('出售情況', max_length=64, null=True, db_index=True)  # sale status

    def __str__(self):
        # Displays the numeric seat-type code, not its label.
        return str(self.seat_type)

    class Meta:
        verbose_name_plural = '車座表'
# 郵箱驗證碼
class EmailVerifyRecord(models.Model):
    """An e-mail verification code issued for registration or password recovery."""

    code = models.CharField(verbose_name=u"驗證碼", max_length=20)  # verification code
    email = models.EmailField(verbose_name=u"郵箱", max_length=50)  # recipient address
    send_type = models.CharField(
        max_length=10,
        choices=(
            ('register', u"注冊"),
            ('forget', u"找回密碼"),
        ),
    )  # purpose of the code
    send_time = models.DateTimeField(auto_now=True)  # timestamp, refreshed on every save

    class Meta:
        verbose_name = u"郵箱驗證碼"
        verbose_name_plural = verbose_name
# 車票表
class Ticket(models.Model):
    """A purchased ticket: which user holds which seat on which train."""

    ticket = models.CharField('車票號', max_length=32, primary_key=True)  # ticket number (primary key)
    # on_delete is spelled out: CASCADE is the implicit Django 1.11 behaviour
    # and the argument is mandatory from Django 2.0.
    train_size = models.ForeignKey(verbose_name='關聯列車號', to='Train',
                                   on_delete=models.CASCADE)  # train
    coach_size = models.CharField('車廂號', max_length=32)  # coach number
    seat_size = models.CharField('座位號', max_length=32)  # seat number
    user = models.ForeignKey(verbose_name='關聯用戶表', to='UserInfo',
                             on_delete=models.CASCADE)  # ticket holder
    price = models.CharField('價格', max_length=32)  # price
    pay_type = models.CharField('支付方式', max_length=32)  # payment method
    depart_time = models.CharField('出發時間', max_length=32)  # departure time
    arrive_time = models.CharField('到達時間', max_length=32)  # arrival time
    depart_stand = models.CharField('出發站', max_length=32)  # departure station
    arrive_stand = models.CharField('到達站', max_length=32)  # arrival station
    buy_time = models.CharField('購買時間', max_length=32)  # purchase time
    is_quit = models.CharField('是否退票', max_length=32)  # refunded or not

    def __str__(self):
        return str(self.ticket)

    class Meta:
        verbose_name_plural = '車票表'
# 爬蟲IP表
class IP(models.Model):
    """A proxy endpoint used by the crawler."""

    ip = models.CharField('IP地址', max_length=32)  # proxy host
    port = models.CharField('端口號', max_length=32)  # proxy port
    expire_time = models.CharField('過期時間', max_length=32)  # expiry time
    city = models.CharField('地區', max_length=32)  # region

    class Meta:
        verbose_name_plural = '爬蟲IP表'

    def __str__(self):
        # Rendered as "<ip>:<port>".
        return '{}:{}'.format(self.ip, self.port)
爬取所需代理ip ip.py
(截至 2019-7-4,爬取可能已失效;可以先自行學習一下 requests 庫)
import requests
from app01 import models
class Get_IP():
    """Obtain a proxy from the Zhima proxy API and cache it in the ``IP`` table.

    Exactly one row is used as the cache: ``zhimaruanjian`` writes the row with
    ``id=1`` and ``select_ip`` reads it back.
    """

    def __init__(self):
        pass

    def select_ip(self):
        """Return the cached proxy row (``id=1``), or ``None`` if it does not exist."""
        return models.IP.objects.filter(id=1).first()

    def zhimaruanjian(self, url=None):
        """Fetch one proxy from the Zhima API and store it as row ``id=1``.

        API used: http://webapi.http.zhimacangku.com/getip?

        :param url: full API URL; when falsy, the built-in URL is used (its
            ``pack`` value is a placeholder to be replaced with your own).
        :return: ``True`` when the API answers with a non-zero ``code``
            (nothing is stored); ``None`` after a proxy has been stored.
        """
        if not url:
            # The last parameter was previously garbled to "®ions=" (an
            # HTML-entity-decoded "&reg"); it is restored to "&regions=".
            url = 'http://webapi.http.zhimacangku.com/getip?num=1&type=2&pro=&city=0&yys=0&port=1&pack=自己的pack號&ts=1&ys=0&cs=1&lb=1&sb=0&pb=4&mr=1&regions='
        # NOTE(review): no timeout is set, so this call can block indefinitely.
        r = requests.get(url)
        payload = r.json()
        if payload['code'] != 0:
            return True
        entry = payload['data'][0]
        ip = entry['ip']
        port = entry['port']
        print('獲取到新ip %s'%(str(ip) +':'+ str(port)))
        # Always write row id=1, so select_ip() finds the proxy even when the
        # table already holds other rows or the auto-increment counter has moved.
        models.IP.objects.update_or_create(
            id=1,
            defaults={
                'ip': ip,
                'port': port,
                'expire_time': entry['expire_time'],
                'city': entry['city'],
            },
        )
if __name__ == '__main__':
    # Manual run: fetch one proxy from the API and store it.
    Get_IP().zhimaruanjian()
數據的定制爬取
(截止2019-7-4,數據統計:爬取車站總數2863個,涉及城市1260個,車站停靠數86037個,自制座位數5244727條( 車站停靠數86037個 X 列車數7節 X 每車廂100座位))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
from django.core.wsgi import get_wsgi_application
# Select the project's settings module and build the WSGI application before
# the app's models are imported below.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "cp12306.settings")
application = get_wsgi_application()
from app01 import models
import xlrd
import json
import time
import requests
import random
from lxml import etree
from django.db.models import Count
from app01.myfile.ip import Get_IP
from concurrent.futures import ThreadPoolExecutor # thread pool executor
# NOTE: this rebinds the name ``Get_IP`` from the imported class to one shared
# instance; the functions below call ``select_ip``/``zhimaruanjian`` on it.
Get_IP = Get_IP()
"""
用到的知識點:
1) bulk_create 批量添加數據
2) xlrd 操作Excel表
3) list(set(city_name_list)) 列表 利用set的自動去重功能
4) 分組和聚合函數使用:
train_list = models.Train.objects.annotate(a = Count('station2train__train')).values('train_size','a')
"""
# 添加城市
def city():
    """Import the distinct city names from the station spreadsheet into ``City``.

    Reads column index 5 of the first sheet of ``火車站信息表.xlsx`` and
    bulk-inserts one ``City`` per distinct value. Ids are assigned 1..N in
    first-seen order, so they are the same on every run for the same file.
    Prints a success message after the insert, or the "already exists"
    message if the insert raises.
    """
    workbook = xlrd.open_workbook('火車站信息表.xlsx')  # open the Excel workbook
    city_name_list = workbook.sheets()[0].col_values(5)  # city-name column

    # Order-preserving de-duplication: iterating a set would assign ids in an
    # order that changes from run to run.
    seen = set()
    query_list = []
    for city_name in city_name_list:
        if city_name in seen:
            continue
        seen.add(city_name)
        query_list.append(models.City(id=len(query_list) + 1, city_name=city_name))

    try:
        models.City.objects.bulk_create(query_list)
    except Exception:
        # Explicit ids collide when the table is already populated.
        print('城市列表已經存在!')
    else:
        # Reported only once the insert has actually succeeded.
        print('城市列表添加完成!')
# 添加車站
def station():
    """Import every row of the station spreadsheet into ``Station``.

    Each row of the first sheet of ``火車站信息表.xlsx`` becomes one station
    whose id is the 1-based row number; columns 1-4 supply the name, English
    code, pinyin and abbreviated pinyin, and column 5 (the city name) is
    translated to a ``City`` id. Prints a success message after the insert, or
    the "already exists" message if the insert raises.
    """
    # Map city name -> city id.
    city_dic = dict(models.City.objects.values_list('city_name', 'id'))

    sheet = xlrd.open_workbook('火車站信息表.xlsx').sheets()[0]  # open the Excel sheet
    query_list = []
    for row_index in range(sheet.nrows):
        row = sheet.row_values(row_index)  # one spreadsheet row
        query_list.append(models.Station(
            id=row_index + 1,
            station_name=row[1],
            english=row[2],
            spell=row[3],
            spell_brief=row[4],
            # None when the city name is not in the City table.
            city_id=city_dic.get(row[5]),
        ))

    try:
        models.Station.objects.bulk_create(query_list)
    except Exception:
        # Explicit ids collide when the table is already populated.
        print('車站列表已經存在!')
    else:
        print('車站列表添加完成!')
#
# # 添加座位類型
# def seat_type():
# seat_list = ['商務座', '一等座', '二等座', '高級軟卧', '高級硬卧', '硬座', '無座']
# query_list = []
# for id, seat_type in enumerate(seat_list, 1):
# query_list.append(models.Seat_Type(id=id, seat_type=seat_type))
#
# # 批量插入數據庫之bulk_create()
# try:
# models.Seat_Type.objects.bulk_create(query_list)
# print('座位類型添加完成!')
# except:
# print('座位類型已經存在!')
# 爬取列車數據
def pa(station_dic, train_size):
    """Crawl and parse the timetable page of one train.

    :param station_dic: mapping of station name -> station id.
    :param train_size: train number, appended to the timetable URL.
    :return: ``False`` when the page cannot be fetched or parsed (including a
        station name missing from ``station_dic``); otherwise a dict::

            {'start_stand': <station id>, 'terminus': <station id>,
             'depart': <str>, 'arrive': <str>,
             'data': {<stop number>: [station id, arrive_time, depart_time,
                                      distance, is_state]}}

        where ``is_state`` is '起' (origin), '終' (terminus) or '暫'
        (intermediate stop).
    """
    url = 'http://checi.114piaowu.com/{}'.format(train_size)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Cookie': 'CLIENT_SOURCE=baidu_www.baidu.com; CLIENT_FIRST_ENTER=pc_shike; tostation=%E5%88%B0%E8%BE%BE%E5%9F%8E%E5%B8%82; UM_distinctid=16bb7da72431fe-07a5e239d1d8e6-37677e02-1aeaa0-16bb7da7244977; JSESSIONID=DF0894D3C3B6127C656BF6ADF714674E; fromstation=%E9%98%BF%E5%B0%94%E5%B1%B1; CLIENT_LAST_ENTER=pc_checi',
        'Host': 'checi.114piaowu.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }

    def get_ip():
        """Build the requests ``proxies`` mapping from the cached proxy row."""
        ip_obj = Get_IP.select_ip()
        ip = ip_obj.ip + ":" + ip_obj.port
        proxies = {
            'http': ip,
            'https': ip
        }
        print(proxies)
        return proxies

    def get_url():
        """Fetch the timetable page through the current proxy."""
        r = requests.get(url=url, headers=headers, proxies=get_ip(), timeout=10)
        print(r.status_code)
        return r

    try:
        try:
            r = get_url()
        # A tuple is required here: ``except A or B`` only ever catches ``A``.
        except (requests.exceptions.ConnectTimeout, requests.exceptions.ProxyError):
            # The proxy is dead: fetch a new one and give up on this train.
            print('代理ip無效')
            Get_IP.zhimaruanjian()
            return False
        except requests.exceptions.ReadTimeout:
            print('讀取超時')
            return False
        except requests.exceptions.TooManyRedirects:
            # The proxy is being throttled: retry with up to two fresh proxies.
            print('ip被限制')
            try:
                Get_IP.zhimaruanjian()
                r = get_url()
            except requests.exceptions.ProxyError:
                try:
                    Get_IP.zhimaruanjian()
                    r = get_url()
                except Exception:
                    print('未知錯誤!')
                    return False
    except Exception:
        # Anything else (including failures inside the handlers above).
        # ``Exception`` rather than a bare except so Ctrl-C still stops the crawl.
        print('未知錯誤!')
        Get_IP.zhimaruanjian()
        return False

    # Parse the fetched page.
    selector = etree.HTML(r.text)
    func_dic = {}
    try:
        res = selector.xpath("//dd[@class='line']/ul/li/a/text()")
        func_dic['start_stand'] = station_dic[res[0]]  # origin station id
        func_dic['terminus'] = station_dic[res[1]]  # terminus station id
        res = selector.xpath("//dd[@class='line']/ul/li/text()")
        func_dic['depart'] = res[0]  # departure time at origin
        func_dic['arrive'] = res[1].split('(')[0]  # arrival time at terminus
        res = selector.xpath("//div[@class='list']/table//tr")
        func_dic['data'] = {}
    except Exception:
        # Unexpected page layout or unknown station name.
        return False

    # Collect one entry per timetable row, keyed by stop number.
    for each in res:
        numbers = each.xpath("./td[5]/text()")
        if not numbers:
            # Rows without text in the fifth cell carry no stop data.
            continue
        numbers = numbers[0]
        try:
            station_next = each.xpath("./td[1]/text()")[0]  # stop number
            station = station_dic[each.xpath("./td[2]/a/text()")[0]]  # station id
            arrive_time = each.xpath("./td[3]/text()")[0]  # arrival time
            depart_time = each.xpath("./td[4]/text()")[0]  # departure time
            if numbers == '--':
                numbers = 0
            distance = int(numbers)  # distance value shown for this stop
            if station == func_dic['terminus']:
                is_state = '終'
            elif arrive_time == '--':
                # No arrival time: this is the origin station.
                is_state = '起'
            else:
                is_state = '暫'
        except (KeyError, IndexError, ValueError):
            # Unknown station, missing cell or non-numeric distance: report
            # the train as failed instead of crashing the whole crawl.
            return False
        func_dic['data'][station_next] = [station, arrive_time, depart_time, distance, is_state]
    return func_dic
# 保存車列表
def train_save(dic):
    """Persist one crawled train and all of its stops.

    :param dic: dict with keys ``train_size``, ``train_coding``,
        ``start_stand`` and ``terminus`` (station ids), ``depart``, ``arrive``
        and ``data`` - a mapping of stop number ->
        ``[station id, arrive_time, depart_time, distance, is_state]``.

    The ``Train`` row and its ``Station2Train`` rows are written inside one
    transaction, so a failure part-way through does not leave a train with
    only some of its stops (which a later run would treat as already crawled).
    """
    from django.db import transaction

    train_size = dic.get('train_size')  # train number
    with transaction.atomic():
        # The station ids are assigned directly to the *_id columns; there is
        # no need to load each Station object first.
        train_obj = models.Train.objects.create(
            train_size=train_size,
            train_coding=dic.get('train_coding'),
            start_stand_id=dic.get('start_stand'),
            terminus_id=dic.get('terminus'),
            depart=dic.get('depart'),
            arrive=dic.get('arrive'),
        )
        # One INSERT for all stops of this train.
        models.Station2Train.objects.bulk_create([
            models.Station2Train(
                station_next=station_next,  # stop number
                station_id=val[0],          # station called at
                train=train_obj,            # the train just created
                arrive_time=val[1],
                depart_time=val[2],
                distance=val[3],
                is_state=val[4],
            )
            for station_next, val in dic.get('data').items()
        ])
    print('{}次列車信息存入成功!'.format(train_size))
# 獲取車表
def train():
    """Crawl and store every train listed in ``train_list.js`` that is not yet stored.

    The entries under the ``'2019-07-16'`` key of the JSON file are flattened
    into ``{train number: train code}``. Each train that has no ``Train`` row
    yet is crawled with ``pa`` and saved with ``train_save``. At the end the
    already-present trains and the failed trains are printed.
    """
    info = []    # train numbers that were already stored
    errors = []  # train numbers whose crawl failed
    # Map station name -> station id.
    station_dic = dict(models.Station.objects.values_list('station_name', 'id'))

    # Load the train list.
    with open('train_list.js', 'rb') as f:
        data = json.loads(f.read()).get('2019-07-16')

    # Flatten every group into one {train number: train code} dict.
    data_dic = {
        entry.get('station_train_code').split('(')[0]: entry.get('train_no')
        for group in data.values()
        for entry in group
    }

    for i, (train_size, train_coding) in enumerate(data_dic.items(), 1):
        print('正在爬第{}趟{}列車..'.format(i, train_size))
        # Skip trains that are already stored.
        if models.Train.objects.filter(train_size=train_size).exists():
            print('正在爬第{}趟{}列車數據已存在!'.format(i, train_size))
            info.append(train_size)
            continue

        res_dic = pa(station_dic, train_size)
        if not res_dic:
            # Nothing has been written for this train at this point, so there
            # is nothing to clean up - just record the failure.
            print('正在爬第{}趟{}列車數據報錯!'.format(i, train_size))
            errors.append(train_size)
            continue

        res_dic['train_size'] = train_size
        res_dic['train_coding'] = train_coding
        train_save(res_dic)

    print('已存在列車數據:', info)
    print('不存在列車數據:', errors)
# 添加座位表
def seat():
    """Generate the seat rows for every train: 7 coaches of 100 seats each.

    Seat ids are assigned explicitly, counting up from 1 across all trains.
    The seat type is set to the coach number, and ``is_sell`` is a string of
    ``'1'`` characters with one character per stop of the train.
    """
    train_obj = models.Train.objects.all()
    # Number of stops per train, keyed by train number.
    train_list = models.Train.objects.annotate(a=Count('station2train__train')).values('train_size', 'a')
    res_dic = {train.get('train_size'): train.get('a') for train in train_list}

    seat_id = 0  # running primary key (named so it does not shadow builtin ``id``)
    for index, train in enumerate(train_obj, 1):
        train_size = train.train_size
        stop_count = res_dic.get(train_size)
        print(index, train_size, stop_count)
        sell = '1' * stop_count
        print(sell)

        query_list = []
        for coach_size in range(1, 8):  # 7 coaches
            for seat_size in range(1, 101):  # 100 seats per coach
                seat_id += 1
                query_list.append(models.Seat(
                    id=seat_id,
                    train=train,
                    coach_size=coach_size,
                    seat_type=coach_size,
                    seat_size=seat_size,
                    is_sell=sell,
                ))
        try:
            models.Seat.objects.bulk_create(query_list)
        except Exception:
            # Explicit ids collide when these seats were inserted by an earlier run.
            print('座位列表{}已經存在!'.format(seat_id))
        else:
            print('座位列表添加成功{}條!'.format(seat_id))
def get_ip():
    """Return every value of the ``ip`` column of the remote ``ip`` table as a list.

    Connects to the MySQL database, runs ``select ip from ip``, prints the
    resulting list and its length, and returns the list.
    """
    import pymysql

    # NOTE(review): database credentials are hard-coded in source; they should
    # be moved to configuration or the environment and the password rotated.
    host = '106.75.31.89'
    user = 'root'
    password = 'Aa428912'
    data = 'Ip_conn'
    port = 3306
    # Keyword arguments: PyMySQL 1.0+ no longer accepts these positionally.
    connect = pymysql.connect(host=host, user=user, password=password,
                              database=data, port=port, charset='utf8')
    try:
        cursor = connect.cursor(pymysql.cursors.DictCursor)  # rows come back as dicts
        cursor.execute('select ip from ip')
        rows = cursor.fetchall()
    finally:
        # Always release the connection, even if the query fails.
        connect.close()

    data_list = [row['ip'] for row in rows]
    print(data_list)
    print(len(data_list))
    return data_list
# 爬取距離
def pa_distance(station_name, station_name_1, ip):
    """Look up the distance between two stations on juli.liecheshike.com.

    :param station_name: name of the first station.
    :param station_name_1: name of the second station.
    :param ip: proxy address used for the request (prefixed with ``http://``).
    :return: the text of the page's first ``<h3>`` up to ``'公里'``, as a string.
    :raises IndexError: when the page contains no ``<h3>`` text.
    """
    print(ip)
    proxies = {
        "http": "http://{}".format(ip),
        "https": "http://{}".format(ip),
    }
    url = 'http://juli.liecheshike.com/從{}到{}有多遠'.format(station_name, station_name_1)
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Cookie': 'safedog-flow-item=9C74D71A66F8B17A458732499BAEC7FF; ASPSESSIONIDCADCQTDB=JAMEOIKBLEPLECFDNHNCADNH; __51cke__=; ASPSESSIONIDCABBRQAC=KAGHCJNAIICFAABHNEOKPLIA; __tins__1516098=%7B%22sid%22%3A%201562545736853%2C%20%22vd%22%3A%203%2C%20%22expires%22%3A%201562547696493%7D; __51laig__=14',
        'Host': 'juli.liecheshike.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    # The proxies mapping belongs on the request. It was previously handed to
    # str.format() as an unused keyword, so the proxy was never applied.
    r = requests.get(url=url, headers=headers, proxies=proxies, timeout=30)
    selector = etree.HTML(r.text)
    res = selector.xpath("//h3/text()")
    return res[0].split('公里')[0]
# 距離
def get_distance(data, start_id=45):
    """Crawl station-to-station distances and store the running total per stop.

    For every train id from ``start_id`` up to the number of ``Train`` rows,
    the train's stops are walked in query order; the first stop gets distance
    ``'0'`` and each later stop gets the cumulative distance from the first.

    :param data: list of proxy addresses; one is picked at random per request.
    :param start_id: first train id to process (default 45, the value that was
        previously hard-coded).

    NOTE(review): iterating ``start_id..count`` assumes train ids are
    contiguous starting at 1 - confirm against the table.
    """

    def get(i):
        """Update the distance of every stop of train ``i``."""
        stops = models.Station2Train.objects.filter(train_id=i).values_list('station__station_name', 'pk')
        station_name = ""
        distances = 0
        print('——————第{}站——————'.format(i))
        for index, train_data in enumerate(stops, 1):
            print('第', i, '的', index, '個')
            if index == 1:
                # First stop: nothing to measure from.
                station_name = train_data[0]
                models.Station2Train.objects.filter(train_id=i, station_next=index).update(distance='0')
                continue
            station_name_1 = train_data[0]
            distance = pa_distance(station_name, station_name_1, ip=random.choice(data))
            distances += int(distance)
            print(distances)
            models.Station2Train.objects.filter(train_id=i, station_next=index).update(distance=distances)
            station_name = station_name_1

    res = time.time()
    sumber = models.Train.objects.all().count()
    # The ``with`` block waits for all submitted work, so the elapsed time
    # printed below covers the crawl itself and not just the submissions.
    with ThreadPoolExecutor(max_workers=1) as TP:
        futures = {TP.submit(get, i): i for i in range(start_id, sumber + 1)}
    # An exception raised inside a worker is stored on its future; report it
    # here instead of letting it disappear.
    for future, i in futures.items():
        exc = future.exception()
        if exc is not None:
            print('第{}趟列車距離爬取失敗:{}'.format(i, exc))
    print(sumber)
    print(time.time() - res)
if __name__ == '__main__':
    # Run the whole import pipeline in dependency order.
    Get_IP.zhimaruanjian()  # fetch and store a fresh proxy
    city()  # import cities
    station()  # import stations
    train()  # crawl and store trains
    seat()  # generate seats
    get_distance(get_ip())  # crawl distances using the proxy list
額外文件 (博客園無法上傳大文件,給個外鏈接)
文件列表:
火車站信息表.xlsx
train_list.js
下載地址:
小強雲盤分享鏈接:http://www.liqianglog.top:8002/home/share_link/K6X8028O08 提取密碼為:353C 點擊分享快去分享給好友啵~~
(如果失效,請聯系博主,1206709430@qq.com)