import com.twq.dataset.Utils._
import org.apache.spark.sql.{SaveMode, SparkSession}
/**
 * Demo of partitioned and bucketed Parquet output with Spark SQL.
 *
 * Reads the `trackerSession` Parquet dataset under `BASE_PATH`, queries it
 * un-partitioned, rewrites it partitioned by `year`/`month`/`day`, reads the
 * partitioned copy back (both as a whole and as a single day directory), and
 * finally saves a partitioned + bucketed table named `session`.
 *
 * NOTE(review): assumes the input dataset has `year`, `month`, `day` and
 * `cookie` columns; `day` and `year`/`cookie` are referenced by the queries
 * and writer calls below, `month` is implied by the partition path that is
 * read back — confirm against the actual schema.
 */
object FilePartitionTest {

  /**
   * Runs the demo end to end.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("FilePartitionTest")
      .getOrCreate()

    // Stop the session even when one of the steps below throws.
    try {
      val sessions = spark.read.parquet(s"${BASE_PATH}/trackerSession")
      sessions.show()
      sessions.printSchema()

      // Baseline: query one day from the un-partitioned data.
      sessions.createOrReplaceTempView("non_partition_table")
      spark.sql("select * from non_partition_table where day = 20170903").show()

      // Partition the data by year, month and day. The column order defines
      // the directory nesting (year=.../month=.../day=...), which the direct
      // single-day read further down depends on.
      sessions.write
        .mode(SaveMode.Overwrite)
        .partitionBy("year", "month", "day")
        .parquet(s"${BASE_PATH}/trackerSession_partition")

      val partitionDF = spark.read.parquet(s"${BASE_PATH}/trackerSession_partition")
      partitionDF.show()
      partitionDF.printSchema()

      // Use SQL to query the data for 20170903 from the partitioned copy.
      partitionDF.createOrReplaceTempView("partition_table")
      spark.sql("select * from partition_table where day = 20170903").show()

      // Load the data for 20170903 by reading its partition directory directly.
      val day03DF = spark.read.parquet(
        s"${BASE_PATH}/trackerSession_partition/year=2017/month=201709/day=20170903")
      day03DF.show()
      day03DF.printSchema()

      // Bucketing can only be used when writing to a table (saveAsTable),
      // and only for the parquet, json and orc file formats.
      sessions.write
        .partitionBy("year")
        .bucketBy(24, "cookie")
        .saveAsTable("session")
    } finally {
      spark.stop()
    }
  }
}