状况:在 pyspark 程序中写了如下代码,最后一行 print 语句报错
import pandas as pd
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
import copy
# Spark session for ticket BR-54751; Hive support is required to read ff_facts.
spark = SparkSession.builder.appName("BR-54751").enableHiveSupport().getOrCreate()

# Distinct active accounts per screen resolution, iOS only, 2020-05-11..15.
_RESOLUTION_QUERY = """
select
screen_width,
screen_height,
count(distinct account_id) as people
from ff_facts.daily_active_account_facts
where local_dt between "20200511" and "20200515"
and system_platform = "iOS"
group by screen_height,screen_width
order by screen_width,screen_height
"""

df = spark.sql(_RESOLUTION_QUERY)
# Collect the (small) aggregated result to the driver as a pandas DataFrame.
data = df.toPandas()

# Normalize orientation: make column 0 (screen_width) the larger dimension,
# i.e. treat every device as landscape.
for i in range(data.shape[0]):
    if data.iloc[i, 0] < data.iloc[i, 1]:
        # Tuple swap — no temp variable needed.
        data.iloc[i, 0], data.iloc[i, 1] = data.iloc[i, 1], data.iloc[i, 0]

# Sort by width ascending and rebuild a clean 0..n-1 index.
data = data.sort_values(by="screen_width", ascending=True)
data.reset_index(drop=True, inplace=True)

# BUG FIX: `from pyspark.sql.functions import *` (top of file) shadows the
# builtin sum() with pyspark's Column-based sum(), which cannot take a pandas
# Series — hence the error on the original `sum(data["people"])`. Calling the
# Series method `.sum()` sidesteps the name collision entirely.
print("total=", data["people"].sum())
最后一句报错,显示sum()函数有问题
原因:
我在import中 声明了from pyspark.sql.functions import *
程序中 sum()和pyspark中的sum()冲突了。
我默认sum()是python自带的函数,但是由于声明了pyspark中的函数,导致程序解析了pyspark中的sum()函数,造成了冲突
解决:
方式1:删除 from pyspark.sql.functions import *,改为按需显式导入所用到的函数(注意:如果程序其他地方用到了 pyspark 的函数,需要逐一补上导入)
方式2:将 sum(data["people"]) 改为 data["people"].sum(),调用 pandas 的方法,不受 import 方式影响(推荐)
