python DataFrame 交并差集


def _connect(param):
    """Build a ``mysql.MYSQL`` handle from a connection-parameter dict.

    Reads the ``db_ip``, ``db_port``, ``db_user``, ``db_pwd`` and ``db_name``
    keys; the port is converted to ``int`` before being passed on.
    """
    return mysql.MYSQL(
        host=param['db_ip'],
        port=int(param['db_port']),
        user=param['db_user'],
        pwd=param['db_pwd'],
        db=param['db_name'],
    )


def _describe_columns(db, param):
    """Run the column-metadata query for ``param``'s table and wrap it in a DataFrame.

    NOTE(review): presumably ``ExecQuery`` returns a sequence of 4-item rows
    matching ``filed_columns`` -- confirm against the mysql helper.
    """
    sql = desc_sql.format(
        table_name=param['table_name'],
        database_name=param['db_name'],
    )
    return pd.DataFrame(db.ExecQuery(sql), columns=filed_columns)


# One handle per side of the comparison: source first, then target.
smysqldb = _connect(source_param)
tmysqldb = _connect(target_param)

# Column metadata for a single table, read from information_schema.
# NOTE(review): table/database names are interpolated straight into the SQL
# text via str.format -- acceptable only if the params come from trusted config.
desc_sql = (
    "SELECT c.COLUMN_NAME AS filed_name,c.COLUMN_TYPE AS filed_type,c.DATA_TYPE as data_type,c.CHARACTER_MAXIMUM_LENGTH as char_length FROM information_schema. TABLES t INNER JOIN information_schema. COLUMNS c ON t.TABLE_NAME = c.TABLE_NAME"
    " AND t.TABLE_SCHEMA = c.TABLE_SCHEMA WHERE t.TABLE_NAME = '{table_name}' AND t.TABLE_SCHEMA = '{database_name}'"
)
filed_columns = ['filed_name', 'filed_type', 'data_type', 'char_length']

# df1 describes the source table, df2 the target table.
df1 = _describe_columns(smysqldb, source_param)
df2 = _describe_columns(tmysqldb, target_param)

df1内容

 

 

 

df2内容

 

 可以看出df2比df1多两个字段 etl_date,real_pay_success_time

 1)inner、left join、right join、outer join

# Intersection: rows that appear in both df1 and df2.
# No `on=` is given, so the join key is every column the two frames share.
inner_df = df1.merge(df2, how='inner')
print(inner_df)

# Left join: every row of df1 is kept, so the result is the df1 side.
left_df = df1.merge(df2, how='left')
print(left_df)

 

# Right join: every row of df2 is kept, so the result is the df2 side.
right_df = df1.merge(df2, how='right')
print(right_df)

 

 

# Outer join: the union of df1 and df2 -- every row found in either frame.
outer_df = df1.merge(df2, how='outer')
print(outer_df)

 

 

 2)求差集

  df1-df2

# df1 - df2: df2 is stacked twice so each of its rows is guaranteed to be a
# duplicate; keep=False then drops every duplicated (filed_name, filed_type)
# pair, leaving only the rows that occur in df1 alone.
# NOTE(review): assumes df1 has no repeated (filed_name, filed_type) pairs of
# its own -- such rows would be dropped as well.
diff_keys = ['filed_name', 'filed_type']
stacked = pd.concat([df1, df2, df2])
df = stacked.drop_duplicates(subset=diff_keys, keep=False)
print(df)

 

 

 df2-df1

# df2 - df1: df1 is stacked twice so each of its rows is guaranteed to be a
# duplicate; keep=False then drops every duplicated (filed_name, filed_type)
# pair, leaving only the rows that occur in df2 alone.
# NOTE(review): assumes df2 has no repeated (filed_name, filed_type) pairs of
# its own -- such rows would be dropped as well.
diff_keys = ['filed_name', 'filed_type']
stacked = pd.concat([df2, df1, df1])
df = stacked.drop_duplicates(subset=diff_keys, keep=False)
print(df)

 

 

 当 df1 的字段全部包含在 df2 中时(如本例),上面的 df2-df1 等同于下面的写法(严格来说,下面求的是对称差集,即只出现在其中一方的行)

# Symmetric difference: stack both frames once and drop every
# (filed_name, filed_type) pair that occurs more than once (keep=False removes
# all copies of a duplicate). What remains are the rows found in only one of
# the two frames; this equals df2 - df1 only when df1's rows are a subset of df2's.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0.
fileds_df = pd.concat([df1, df2]).drop_duplicates(subset=['filed_name', 'filed_type'], keep=False)
print(fileds_df)
# Same rows with the operands swapped; only the row order of the result differs.
fileds_df = pd.concat([df2, df1]).drop_duplicates(subset=['filed_name', 'filed_type'], keep=False)
print(fileds_df)

 


免责声明!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系本站邮箱yoyou2525@163.com删除。



 
粤ICP备18138465号  © 2018-2025 CODEPRJ.COM