python + mysql 清洗数据


1、创建表

CREATE TABLE `test_sdr_daily_action` (
    `log_date` DATE NOT NULL
    ,`sales_id` BIGINT (20) NOT NULL
    ,`sales_name` VARCHAR(256) NOT NULL
    ,`contacted_prospect` INT (11) NOT NULL DEFAULT 0
    ,`total_call` INT (11) NOT NULL DEFAULT 0
    ,`success_call` INT (11) NOT NULL DEFAULT 0
    ,`success_call_time` INT (11) NOT NULL DEFAULT 0
    ,`average_call_time` DECIMAL(10, 4) NOT NULL DEFAULT 0
    )
View Code

 2、书写python脚本

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""

Create on ''
Update on ''
Author:guangxu.qi
"""

import sys
reload(sys)
sys.setdefaultencoding('utf-8')
sys.path.append("/data/dp/dp_common/common/dbase")
from myDbase import Mysql
from myEngine import MyEngine
import datetime
import pandas as pd

lucrativ = Mysql("LUCRATIV")
lucrativ.getAll("set time_zone='PST8PDT'")

def get_data(log_date):
    sql = """
    SELECT user_id sales_id
        ,concat(uu.first_name,' ',uu.last_name) sales_name
        ,count(*) total_call
        ,count(distinct contact_id) contacted_prospect
        ,sum(case when bcr.status in ('both-completed','called-completed') then 1 else 0 end) success_call
        ,sum(case when bcr.status in ('both-completed','called-completed') then duration else 0 end) success_call_time
    FROM bell.call_record bcr
    JOIN user.user uu ON bcr.user_id = uu.id
        AND uu.team_id = 74101849065537536
    WHERE user_id IN (92214216773079040, 92216577134432256, 98385799069859840, 98386104637489152, 104237409167769600)
        AND bcr.team_id = 74101849065537536
        AND bcr.create_time >= '%s'
        AND bcr.create_time < date_add('%s',interval 1 day)
        -- AND bcr.contact_type = 'prospect'
    GROUP BY user_id""" %(log_date,log_date)
    result = lucrativ.getAll(sql)
    print result
    print pd.DataFrame(list(result))
    return pd.DataFrame(list(result))

def merge_data(log_date):
    # 获取数据
    data = get_data(log_date)
    # 处理数据
    data['log_date'] = log_date
    data['average_call_time'] = data['success_call_time'] / data['success_call']
    # 插入数据
    insert_data(data, log_date)

def insert_data(data, log_date):
    obj = MyEngine("DP_STAT")
    sql = "delete from test_sdr_daily_action where log_date = '%s'"%(log_date)
    obj.delete(sql)
    print 'delete success'
    obj.insert('test_sdr_daily_action',data)
    print 'insert success'

if __name__ == '__main__':
    print "[" + sys.argv[0] + "] started at:", str(datetime.datetime.now())[0:19]
    log_date = ''
    if len(sys.argv) == 1:
        log_date = datetime.date.today() - datetime.timedelta(days=1)
        log_date = log_date.strftime('%Y-%m-%d')
    elif len(sys.argv) == 2:
        log_date = sys.argv[1]
    else:
        print 'wrong input parameter !!!'
        print "[" + sys.argv[0] + "] finished at:", str(datetime.datetime.now())[0:19]
        sys.exit()
    print "log_date: ",log_date
    merge_data(log_date)
    print "[" + sys.argv[0] + "] finished at:", str(datetime.datetime.now())[0:19]
View Code

 3、常用shell命令

创建文件夹
mkdir test

回主目录
cd
返回上级目录
cd ..
进入目录
cd testdir

编辑文本
vim test.txt
vim hello_world.py

下载文件
sz test.txt
上传文件
rz
View Code

4、基础SQL

show databases;

use bell;

show tables;

desc bell.call_record;

set time_zone='PST8PDT';

SELECT count(*) call_num
FROM bell.call_record
WHERE user_id = 92214216773079040
    AND team_id = 74101849065537536
    AND create_time >= '2019-04-11'
    AND create_time < '2019-04-12';
    
SELECT count(*) call_num
    ,count(DISTINCT contact_id) call_prospect
FROM bell.call_record
WHERE user_id = 92214216773079040
    AND team_id = 74101849065537536
    AND create_time >= '2019-04-11'
    AND create_time < '2019-04-12';
    
SELECT count(*) call_num
    ,count(DISTINCT contact_id) call_prospect
    ,sum(CASE WHEN status IN ('both-completed', 'called-completed') THEN 1 ELSE 0 END) success_call_num
FROM bell.call_record
WHERE user_id = 92214216773079040
    AND team_id = 74101849065537536
    AND create_time >= '2019-04-11'
    AND create_time < '2019-04-12';

SELECT user_id 
    ,count(*) call_num
FROM bell.call_record
WHERE user_id IN (92214216773079040, 92216577134432256, 98385799069859840, 98386104637489152, 104237409167769600)
    AND team_id = 74101849065537536
    AND create_time >= '2019-04-11'
    AND create_time < '2019-04-12'
GROUP BY user_id

SELECT user_id sales_id
    ,concat(uu.first_name,' ',uu.last_name) sales_name
    ,count(*) total_call
    ,count(distinct contact_id) contacted_prospect
    ,sum(case when bcr.status in ('both-completed','called-completed') then 1 else 0 end) success_call
    ,sum(case when bcr.status in ('both-completed','called-completed') then duration else 0 end) success_call_time
FROM bell.call_record bcr
JOIN user.user uu ON bcr.user_id = uu.id
    AND uu.team_id = 74101849065537536
WHERE user_id IN (92214216773079040, 92216577134432256, 98385799069859840, 98386104637489152, 104237409167769600)
    AND bcr.team_id = 74101849065537536
    AND bcr.create_time >= '2019-04-11'
    AND bcr.create_time < '2019-04-12'
    -- AND bcr.contact_type = 'prospect'
GROUP BY user_id
View Code

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM