一、數據內容
t20
├── data
│ ├── 00000-0-9c7ff22e-a767-4b85-91ec-a2771e54c209-00001.parquet
│ └── 00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet
└── metadata
    ├── 00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json
    ├── 00001-aabfd9a8-7dcd-4aa0-99aa-f6695f39bf6b.metadata.json
    ├── 00002-b5b7725f-7e86-454b-8d16-0e142bc84266.metadata.json
    ├── 0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro
    ├── f787e035-8f7c-43a3-b264-42057bad2710-m0.avro
    ├── snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro
    └── snap-6460256963744122971-1-f787e035-8f7c-43a3-b264-42057bad2710.avro
二、文件詳解
data目錄存放數據文件,metadata目錄存放元數據文件
建表時會生成metadata/00000-xx.metadata.json
每做一次insert都會生成新的數據文件和元數據文件,其中包括新的xxx.metadata.json(00001-xx.metadata.json、00002-xx.metadata.json,依此類推)
1、數據
xxx.parquet
$ parquet head ~/t20/data/00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet
{"id": 20}
$ parquet head ~/t20/data/00000-0-9c7ff22e-a767-4b85-91ec-a2771e54c209-00001.parquet
{"id": 10}
2、元數據
(1)xxx.metadata.json
從hive metastore的mysql庫的TABLE_PARAMS表可以查到該表的metadata_location,即當前xxx.metadata.json的位置。從這個文件可以拿到當前表的快照 id(current-snapshot-id),以及這張表的所有快照信息,也就是 JSON 信息裡面的 snapshots 數組對應的值
(2)清單列表(相當於snapshot):snap-xxx.avro
每個快照對應一個清單列表文件,記錄該快照包含的一系列清單文件,每行中存儲了清單文件的路徑、清單文件裡面存儲數據文件的分區範圍、增加了幾個數據文件、刪除了幾個數據文件等信息。這些信息可以用來在查詢時提供過濾
manifest_path
|
manifest_length
|
partition_spec_id
|
added_snapshot_id
|
added_data_files_count
|
existing_data_files_count
|
deleted_data_files_count
|
partitions
|
added_rows_count
|
existing_rows_count
|
deleted_rows_count
|
hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/f787e035-8f7c-43a3-b264-42057bad2710-m0.avro
|
5514
|
0
|
6460256963744122971
|
1
|
0
|
0
|
[]
|
1
|
0
|
0
|
hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro
|
5514
|
0
|
6190364701448945732
|
1
|
0
|
0
|
[]
|
1
|
0
|
0
|
(3)清單(manifest 文件):xxx-m0.avro
每行都是每個數據文件的詳細描述,包括數據文件的狀態、文件路徑、分區信息、列級別的統計信息(比如每列的最大最小值、空值數等)、文件的大小以及文件裡面數據的行數等信息。其中列級別的統計信息在 Scan 的時候可以為算子下推提供數據,以便可以過濾掉不必要的文件
{
"status": 1, "snapshot_id": {"long": 6460256963744123000}, "data_file": { "file_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/data/00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet", "file_format": "PARQUET", "partition": {}, "record_count": 1, "file_size_in_bytes": 387, "block_size_in_bytes": 67108864, "column_sizes": { "array": [{ "key": 1, "value": 51}] }, "value_counts": { "array": [{"key": 1,"value": 1}] }, "null_value_counts": { "array": [{"key": 1,"value": 0}] }, "nan_value_counts": {"array": []}, "lower_bounds": { "array": [{"key": 1,"value": "\u0014\u0000\u0000\u0000"}] }, "upper_bounds": { "array": [{"key": 1,"value": "\u0014\u0000\u0000\u0000"}] }, "key_metadata": null, "split_offsets": { "array": [4] } } }
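以下是完整的metadata目錄下的文件內容,有興趣的可以再深究(注意:avro文件轉成JSON顯示時,snapshot_id等long型數值被四捨五入了,例如6460256963744122971顯示為6460256963744123000,應是查看工具的精度損失所致,實際值以metadata.json中的為準)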
metadata/00001-aabfd9a8-7dcd-4aa0-99aa-f6695f39bf6b.metadata.json
{ "format-version" : 1, "table-uuid" : "900edf11-3434-408d-a789-a6a5acecdca3", "location" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20", "last-updated-ms" : 1619022202031, "last-column-id" : 1, "schema" : { "type" : "struct", "fields" : [ { "id" : 1, "name" : "id", "required" : false, "type" : "int" } ] }, "partition-spec" : [ ], "default-spec-id" : 0, "partition-specs" : [ { "spec-id" : 0, "fields" : [ ] } ], "default-sort-order-id" : 0, "sort-orders" : [ { "order-id" : 0, "fields" : [ ] } ], "properties" : { }, "current-snapshot-id" : 6190364701448945732, "snapshots" : [ { "snapshot-id" : 6190364701448945732, "timestamp-ms" : 1619022202031, "summary" : { "operation" : "append", "flink.job-id" : "93d92dedbddaf202ac2a2beb9d381084", "flink.max-committed-checkpoint-id" : "9223372036854775807", "added-data-files" : "1", "added-records" : "1", "added-files-size" : "387", "changed-partition-count" : "1", "total-records" : "1", "total-data-files" : "1", "total-delete-files" : "0", "total-position-deletes" : "0", "total-equality-deletes" : "0" }, "manifest-list" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro" } ], "snapshot-log" : [ { "timestamp-ms" : 1619022202031, "snapshot-id" : 6190364701448945732 } ], "metadata-log" : [ { "timestamp-ms" : 1619020518215, "metadata-file" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json" } ] }
metadata/00002-b5b7725f-7e86-454b-8d16-0e142bc84266.metadata.json
{ "format-version" : 1, "table-uuid" : "900edf11-3434-408d-a789-a6a5acecdca3", "location" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20", "last-updated-ms" : 1619023435305, "last-column-id" : 1, "schema" : { "type" : "struct", "fields" : [ { "id" : 1, "name" : "id", "required" : false, "type" : "int" } ] }, "partition-spec" : [ ], "default-spec-id" : 0, "partition-specs" : [ { "spec-id" : 0, "fields" : [ ] } ], "default-sort-order-id" : 0, "sort-orders" : [ { "order-id" : 0, "fields" : [ ] } ], "properties" : { }, "current-snapshot-id" : 6460256963744122971, "snapshots" : [ { "snapshot-id" : 6190364701448945732, "timestamp-ms" : 1619022202031, "summary" : { "operation" : "append", "flink.job-id" : "93d92dedbddaf202ac2a2beb9d381084", "flink.max-committed-checkpoint-id" : "9223372036854775807", "added-data-files" : "1", "added-records" : "1", "added-files-size" : "387", "changed-partition-count" : "1", "total-records" : "1", "total-data-files" : "1", "total-delete-files" : "0", "total-position-deletes" : "0", "total-equality-deletes" : "0" }, "manifest-list" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro" }, { "snapshot-id" : 6460256963744122971, "parent-snapshot-id" : 6190364701448945732, "timestamp-ms" : 1619023435305, "summary" : { "operation" : "append", "flink.job-id" : "3be57424a6547f41f1df350f9667ae65", "flink.max-committed-checkpoint-id" : "9223372036854775807", "added-data-files" : "1", "added-records" : "1", "added-files-size" : "387", "changed-partition-count" : "1", "total-records" : "2", "total-data-files" : "2", "total-delete-files" : "0", "total-position-deletes" : "0", "total-equality-deletes" : "0" }, "manifest-list" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/snap-6460256963744122971-1-f787e035-8f7c-43a3-b264-42057bad2710.avro" } ], "snapshot-log" : [ { "timestamp-ms" : 1619022202031, "snapshot-id" : 6190364701448945732 }, { 
"timestamp-ms" : 1619023435305, "snapshot-id" : 6460256963744122971 } ], "metadata-log" : [ { "timestamp-ms" : 1619020518215, "metadata-file" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json" }, { "timestamp-ms" : 1619022202031, "metadata-file" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/00001-aabfd9a8-7dcd-4aa0-99aa-f6695f39bf6b.metadata.json" } ] }
metadata/00000-d864e750-e5e2-4afd-bddb-2fab1e627a21.metadata.json
{ "format-version" : 1, "table-uuid" : "900edf11-3434-408d-a789-a6a5acecdca3", "location" : "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20", "last-updated-ms" : 1619020518215, "last-column-id" : 1, "schema" : { "type" : "struct", "schema-id" : 0, "fields" : [ { "id" : 1, "name" : "id", "required" : false, "type" : "int" } ] }, "current-schema-id" : 0, "schemas" : [ { "type" : "struct", "schema-id" : 0, "fields" : [ { "id" : 1, "name" : "id", "required" : false, "type" : "int" } ] } ], "partition-spec" : [ ], "default-spec-id" : 0, "partition-specs" : [ { "spec-id" : 0, "fields" : [ ] } ], "last-partition-id" : 999, "default-sort-order-id" : 0, "sort-orders" : [ { "order-id" : 0, "fields" : [ ] } ], "properties" : { }, "current-snapshot-id" : -1, "snapshots" : [ ], "snapshot-log" : [ ], "metadata-log" : [ ] }
metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro
{ "status": 1, "snapshot_id": { "long": 6190364701448946000 }, "data_file": { "file_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/data/00000-0-9c7ff22e-a767-4b85-91ec-a2771e54c209-00001.parquet", "file_format": "PARQUET", "partition": {}, "record_count": 1, "file_size_in_bytes": 387, "block_size_in_bytes": 67108864, "column_sizes": { "array": [ { "key": 1, "value": 51 } ] }, "value_counts": { "array": [ { "key": 1, "value": 1 } ] }, "null_value_counts": { "array": [ { "key": 1, "value": 0 } ] }, "nan_value_counts": { "array": [] }, "lower_bounds": { "array": [ { "key": 1, "value": "\n\u0000\u0000\u0000" } ] }, "upper_bounds": { "array": [ { "key": 1, "value": "\n\u0000\u0000\u0000" } ] }, "key_metadata": null, "split_offsets": { "array": [ 4 ] } } }
metadata/f787e035-8f7c-43a3-b264-42057bad2710-m0.avro
{ "status": 1, "snapshot_id": { "long": 6460256963744123000 }, "data_file": { "file_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/data/00000-0-ecd3f21c-1bc0-4cdc-8917-d9a1afe7ce55-00001.parquet", "file_format": "PARQUET", "partition": {}, "record_count": 1, "file_size_in_bytes": 387, "block_size_in_bytes": 67108864, "column_sizes": { "array": [ { "key": 1, "value": 51 } ] }, "value_counts": { "array": [ { "key": 1, "value": 1 } ] }, "null_value_counts": { "array": [ { "key": 1, "value": 0 } ] }, "nan_value_counts": { "array": [] }, "lower_bounds": { "array": [ { "key": 1, "value": "\u0014\u0000\u0000\u0000" } ] }, "upper_bounds": { "array": [ { "key": 1, "value": "\u0014\u0000\u0000\u0000" } ] }, "key_metadata": null, "split_offsets": { "array": [ 4 ] } } }
metadata/snap-6190364701448945732-1-0254b8b6-4d76-473c-86c2-97acda68d587.avro
{ "manifest_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro", "manifest_length": 5514, "partition_spec_id": 0, "added_snapshot_id": { "long": 6190364701448946000 }, "added_data_files_count": { "int": 1 }, "existing_data_files_count": { "int": 0 }, "deleted_data_files_count": { "int": 0 }, "partitions": { "array": [] }, "added_rows_count": { "long": 1 }, "existing_rows_count": { "long": 0 }, "deleted_rows_count": { "long": 0 } }
metadata/snap-6460256963744122971-1-f787e035-8f7c-43a3-b264-42057bad2710.avro
{ "manifest_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/f787e035-8f7c-43a3-b264-42057bad2710-m0.avro", "manifest_length": 5514, "partition_spec_id": 0, "added_snapshot_id": { "long": 6460256963744123000 }, "added_data_files_count": { "int": 1 }, "existing_data_files_count": { "int": 0 }, "deleted_data_files_count": { "int": 0 }, "partitions": { "array": [] }, "added_rows_count": { "long": 1 }, "existing_rows_count": { "long": 0 }, "deleted_rows_count": { "long": 0 } } { "manifest_path": "hdfs://rick-82lb:9000/user/hive2/warehouse/test.db/t20/metadata/0254b8b6-4d76-473c-86c2-97acda68d587-m0.avro", "manifest_length": 5514, "partition_spec_id": 0, "added_snapshot_id": { "long": 6190364701448946000 }, "added_data_files_count": { "int": 1 }, "existing_data_files_count": { "int": 0 }, "deleted_data_files_count": { "int": 0 }, "partitions": { "array": [] }, "added_rows_count": { "long": 1 }, "existing_rows_count": { "long": 0 }, "deleted_rows_count": { "long": 0 } }