官网文档:https://iceberg.apache.org/spec/#manifests
一:了解iceberg 的文件结构
二:首先建立一个iceberg 表
CREATE TABLE sample7 (
`id` BIGINT,
`data` STRING,
`name` STRING,
`category` STRING)
USING iceberg
PARTITIONED BY (category, name)
TBLPROPERTIES( "format-version"="2")
插入3条数据(2个分区[xc1,pt2] [xc3,pt3])
spark-sql> select * from sample7;
22/08/09 20:17:18 WARN conf.HiveConf: HiveConf of name hive.merge.mapfile does not exist
22/08/09 20:17:18 WARN conf.HiveConf: HiveConf hive.mapjoin.smalltable.filesize expects LONG type value
1 1 xc1 pt2
2 2 xc1 pt2
2 2 xc3 pt3
三:如何查看iceberg 文件信息
1: show create table sample8;
获得信息如下:
CREATE TABLE spark_catalog.zdk.sample8 ( `id` BIGINT, `data` STRING, `name` STRING, `category` STRING) USING iceberg PARTITIONED BY (category) LOCATION 'hdfs://ns1/dtInsight/hive/warehouse/zdk.db/sample8' TBLPROPERTIES( 'current-snapshot-id' = '3329158953122539010', 'format' = 'iceberg/parquet')
2.根据属性信息中的 metadata_location 获取 metadata.json 的位置;
* metadata.json 包含所有的snapshot信息;
* 获取文件中的current-snapshot-id 的值:eg: 3329158953122539010
* 根据获取到的snapshot-id 在当前的文件中查找对应的snapshot信息:
{ "sequence-number" : 6, "snapshot-id" : 3329158953122539010, "parent-snapshot-id" : 6402031869467028731, "timestamp-ms" : 1660186257707, "summary" : { "operation" : "append", "spark.app.id" : "local-1660183319013", "added-data-files" : "1", "added-records" : "1", "added-files-size" : "1120", "changed-partition-count" : "1", "total-records" : "8", "total-files-size" : "7997", "total-data-files" : "7", "total-delete-files" : "0", "total-position-deletes" : "0", "total-equality-deletes" : "0" }, "manifest-list" : "hdfs://ns1/dtInsight/hive/warehouse/zdk.db/sample8/metadata/snap-3329158953122539010-1-f4d2b1bd-22d3-4957-a5fe-f84118663407.avro", "schema-id" : 0 }
* 获取manifest-list 的位置: hdfs://ns1/dtInsight/hive/warehouse/zdk.db/sample8/metadata/snap-3329158953122539010-1-f4d2b1bd-22d3-4957-a5fe-f84118663407.avro,
* 下载 avro 解析工具:avro-tools-1.9.2.jar
* 执行命令:java -jar avro-tools-1.9.2.jar tojson snap-3329158953122539010-1-f4d2b1bd-22d3-4957-a5fe-f84118663407.avro
{"manifest_path":"hdfs://ns1/dtInsight/hive/warehouse/zdk.db/sample8/metadata/9cfc5807-02d1-4cb4-a705-863982a58b05-m1.avro","manifest_length":7173,"partition_spec_id":1,"content":0,"sequence_number":5,"min_sequence_number":3,"added_snapshot_id":6402031869467028731,"added_data_files_count":0,"existing_data_files_count":2,"deleted_data_files_count":0,"added_rows_count":0,"existing_rows_count":2,"deleted_rows_count":0,"partitions":{"array":[{"contains_null":false,"contains_nan":{"boolean":false},"lower_bound":{"bytes":"pt3"},"upper_bound":{"bytes":"pt3"}},{"contains_null":false,"contains_nan":{"boolean":false},"lower_bound":{"bytes":"xc3"},"upper_bound":{"bytes":"xc3"}}]}} {"manifest_path":"hdfs://ns1/dtInsight/hive/warehouse/zdk.db/sample8/metadata/f4d2b1bd-22d3-4957-a5fe-f84118663407-m0.avro","manifest_length":6888,"partition_spec_id":0,"content":0,"sequence_number":6,"min_sequence_number":6,"added_snapshot_id":3329158953122539010,"added_data_files_count":1,"existing_data_files_count":0,"deleted_data_files_count":0,"added_rows_count":1,"existing_rows_count":0,"deleted_rows_count":0,"partitions":{"array":[{"contains_null":false,"contains_nan":{"boolean":false},"lower_bound":{"bytes":"pt3"},"upper_bound":{"bytes":"pt3"}}]}} {"manifest_path":"hdfs://ns1/dtInsight/hive/warehouse/zdk.db/sample8/metadata/9cfc5807-02d1-4cb4-a705-863982a58b05-m0.avro","manifest_length":6890,"partition_spec_id":0,"content":0,"sequence_number":5,"min_sequence_number":5,"added_snapshot_id":6402031869467028731,"added_data_files_count":1,"existing_data_files_count":0,"deleted_data_files_count":0,"added_rows_count":1,"existing_rows_count":0,"deleted_rows_count":0,"partitions":{"array":[{"contains_null":false,"contains_nan":{"boolean":false},"lower_bound":{"bytes":"pt3"},"upper_bound":{"bytes":"pt3"}}]}} {"manifest_path":"hdfs://ns1/dtInsight/hive/warehouse/zdk.db/sample8/metadata/a56dfef2-4592-4d1d-a642-e16f4467dac2-m1.avro","manifest_length":7020,"partition_spec_id":0,"content":0,"sequence_number":3,"min_sequence_number":1,"added_snapshot_id":7048362733565878395,"added_data_files_count":0,"existing_data_files_count":3,"deleted_data_files_count":0,"added_rows_count":0,"existing_rows_count":4,"deleted_rows_count":0,"partitions":{"array":[{"contains_null":false,"contains_nan":{"boolean":false},"lower_bound":{"bytes":"pt2"},"upper_bound":{"bytes":"pt3"}}]}}
当前的manifest_list包含4个manifest 文件,获取其中的一个manifest_path路径:hdfs://ns1/dtInsight/hive/warehouse/zdk.db/sample8/metadata/9cfc5807-02d1-4cb4-a705-863982a58b05-m1.avro
进行查看信息:
{"status":0,"snapshot_id":{"long":32144809447347390},"sequence_number":{"long":4},"data_file":{"content":0,"file_path":"hdfs://ns1/dtInsight/hive/warehouse/zdk.db/sample8/data/category=pt3/name=xc3/00000-0-9f003a4c-d5c0-4b56-9b8f-aab4f1d3a42d-00001.parquet","file_format":"PARQUET","partition":{"category":{"string":"pt3"},"name":{"string":"xc3"}},"record_count":1,"file_size_in_bytes":1120,"column_sizes":{"array":[{"key":1,"value":46},{"key":2,"value":48},{"key":3,"value":50},{"key":4,"value":50}]},"value_counts":{"array":[{"key":1,"value":1},{"key":2,"value":1},{"key":3,"value":1},{"key":4,"value":1}]},"null_value_counts":{"array":[{"key":1,"value":0},{"key":2,"value":0},{"key":3,"value":0},{"key":4,"value":0}]},"nan_value_counts":{"array":[]},"lower_bounds":{"array":[{"key":1,"value":"\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":2,"value":"6"},{"key":3,"value":"xc3"},{"key":4,"value":"pt3"}]},"upper_bounds":{"array":[{"key":1,"value":"\u0005\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":2,"value":"6"},{"key":3,"value":"xc3"},{"key":4,"value":"pt3"}]},"key_metadata":null,"split_offsets":{"array":[4]},"equality_ids":null,"sort_order_id":{"int":0}}} {"status":0,"snapshot_id":{"long":7048362733565878395},"sequence_number":{"long":3},"data_file":{"content":0,"file_path":"hdfs://ns1/dtInsight/hive/warehouse/zdk.db/sample8/data/category=pt3/name=xc3/00000-3-817aac21-1f4a-49ae-97aa-337b3c77a0f8-00001.parquet","file_format":"PARQUET","partition":{"category":{"string":"pt3"},"name":{"string":"xc3"}},"record_count":1,"file_size_in_bytes":1119,"column_sizes":{"array":[{"key":1,"value":45},{"key":2,"value":48},{"key":3,"value":50},{"key":4,"value":50}]},"value_counts":{"array":[{"key":1,"value":1},{"key":2,"value":1},{"key":3,"value":1},{"key":4,"value":1}]},"null_value_counts":{"array":[{"key":1,"value":0},{"key":2,"value":0},{"key":3,"value":0},{"key":4,"value":0}]},"nan_value_counts":{"array":[]},"lower_bounds":{"array":[{"key":1,"value":"\u0004\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":2,"value":"5"},{"key":3,"value":"xc3"},{"key":4,"value":"pt3"}]},"upper_bounds":{"array":[{"key":1,"value":"\u0004\u0000\u0000\u0000\u0000\u0000\u0000\u0000"},{"key":2,"value":"5"},{"key":3,"value":"xc3"},{"key":4,"value":"pt3"}]},"key_metadata":null,"split_offsets":{"array":[4]},"equality_ids":null,"sort_order_id":{"int":0}}}
当前manifest文件包含了两个数据文件,数据文件的路径可以通过data_file获取;
hdfs://ns1/dtInsight/hive/warehouse/zdk.db/sample8/data/category=pt3/name=xc3/00000-0-9f003a4c-d5c0-4b56-9b8f-aab4f1d3a42d-00001.parquet
同时manifest 文件中还记录了对parquet 的列的统计信息
上面的列的统计信息lower_bound,upbounds 在iceberg 里面具体的作用在之后会进行分析;