以下为一个hive-catalog的iceberg表在hdfs目录中的所有文件
包含
1.parquet数据文件
2.json元数据文件
3.avro snapshot文件
4.avro manifest文件
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00001.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00003.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00004.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00005.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00006.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00007.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00008.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00009.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00010.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00011.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00012.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-79d89118-5069-4877-8332-2a592c887fe3-00001.parquet
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00000-f9a42593-ab76-4933-a739-8e10b476fc85.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00001-2002be31-0182-4085-9173-aee3e4facc0b.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00002-2c5e9702-a908-43a6-bbe8-0f0c6582e984.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00003-3db39d6b-6311-4bdb-9d7b-b56f2df74fb3.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00004-a5490f98-4daf-4592-abf1-fdcc408f1b0f.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00005-b13e2c1f-1383-43c3-a53c-832ed8c68fa8.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00006-68ce5b89-27fb-421a-8a49-42f383dfc587.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00007-b3430d66-c9fb-401c-b800-e2ea4ad70d8d.metadata.json
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/09769592-109f-4f6e-ab46-9b597dacfd43-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/1a49a079-d7cf-41a6-931d-15ad2a44914b-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/1a49a079-d7cf-41a6-931d-15ad2a44914b-m1.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07-m1.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/bf413511-d1cf-407f-bcc9-b6960cde7898-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/bf413511-d1cf-407f-bcc9-b6960cde7898-m1.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/e97d1919-f47d-40c0-9eb6-24bf68f96980-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/e97d1919-f47d-40c0-9eb6-24bf68f96980-m1.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m0.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m1.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m2.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m3.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m4.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m5.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m6.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m7.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-1289984099921389549-1-1a49a079-d7cf-41a6-931d-15ad2a44914b.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-3921229567852426700-1-bf413511-d1cf-407f-bcc9-b6960cde7898.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-5386042144404510937-1-09769592-109f-4f6e-ab46-9b597dacfd43.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7125662397327732785-1-2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7329471080018208648-1-f0bd795c-6a10-41bc-8f79-437fef1ff5f9.avro
hdfs://10.177.13.120:8020/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7377732782289998100-1-e97d1919-f47d-40c0-9eb6-24bf68f96980.avro
以下为iceberg表在hive中的建表语句
CREATE EXTERNAL TABLE iceberg_cdc_table
(
id
string COMMENT 'unique ID',
data
string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.FileInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.mapred.FileOutputFormat'
LOCATION
'hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table'
TBLPROPERTIES (
'COLUMN_STATS_ACCURATE'='false',
'metadata_location'='hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00007-b3430d66-c9fb-401c-b800-e2ea4ad70d8d.metadata.json',
'numFiles'='0',
'numRows'='-1',
'previous_metadata_location'='hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00006-68ce5b89-27fb-421a-8a49-42f383dfc587.metadata.json',
'rawDataSize'='-1',
'table_type'='ICEBERG',
'totalSize'='0',
'transient_lastDdlTime'='1619089695')
其中metadata_location为当前的元数据文件,查看该文件
{
"format-version" : 2,
"table-uuid" : "924ae1db-5aad-451a-ae3b-bd933296ea84",
"location" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table",
"last-sequence-number" : 6,
"last-updated-ms" : 1619090084800,
"last-column-id" : 2,
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "id",
"required" : true,
"type" : "string",
"doc" : "unique ID"
}, {
"id" : 2,
"name" : "data",
"required" : true,
"type" : "string"
} ]
} ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ ]
} ],
"last-partition-id" : 999,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"row-key" : {
"identifier-fields" : [ {
"source-id" : 1
} ]
},
"properties" : { },
"current-snapshot-id" : 7329471080018208648,
"snapshots" : [ {
"sequence-number" : 1,
"snapshot-id" : 5386042144404510937,
"timestamp-ms" : 1619089843403,
"summary" : {
"operation" : "append",
"flink.job-id" : "94aed63193990d73442f8696c3eee136",
"flink.max-committed-checkpoint-id" : "1",
"added-data-files" : "1",
"added-records" : "1000000",
"added-files-size" : "3076138",
"changed-partition-count" : "1",
"total-records" : "1000000",
"total-files-size" : "3076138",
"total-data-files" : "1",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
"manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-5386042144404510937-1-09769592-109f-4f6e-ab46-9b597dacfd43.avro"
}, {
"sequence-number" : 2,
"snapshot-id" : 1289984099921389549,
"parent-snapshot-id" : 5386042144404510937,
"timestamp-ms" : 1619089902186,
"summary" : {
"operation" : "overwrite",
"flink.job-id" : "94aed63193990d73442f8696c3eee136",
"flink.max-committed-checkpoint-id" : "2",
"added-data-files" : "1",
"added-delete-files" : "1",
"added-records" : "21892",
"added-files-size" : "184249",
"added-equality-deletes" : "21892",
"changed-partition-count" : "1",
"total-records" : "1021892",
"total-files-size" : "3260387",
"total-data-files" : "2",
"total-delete-files" : "1",
"total-position-deletes" : "0",
"total-equality-deletes" : "21892"
},
"manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-1289984099921389549-1-1a49a079-d7cf-41a6-931d-15ad2a44914b.avro"
}, {
"sequence-number" : 3,
"snapshot-id" : 7377732782289998100,
"parent-snapshot-id" : 1289984099921389549,
"timestamp-ms" : 1619089962201,
"summary" : {
"operation" : "overwrite",
"flink.job-id" : "94aed63193990d73442f8696c3eee136",
"flink.max-committed-checkpoint-id" : "3",
"added-data-files" : "1",
"added-delete-files" : "1",
"added-records" : "73302",
"added-files-size" : "604308",
"added-equality-deletes" : "73302",
"changed-partition-count" : "1",
"total-records" : "1095194",
"total-files-size" : "3864695",
"total-data-files" : "3",
"total-delete-files" : "2",
"total-position-deletes" : "0",
"total-equality-deletes" : "95194"
},
"manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7377732782289998100-1-e97d1919-f47d-40c0-9eb6-24bf68f96980.avro"
}, {
"sequence-number" : 4,
"snapshot-id" : 3921229567852426700,
"parent-snapshot-id" : 7377732782289998100,
"timestamp-ms" : 1619090021768,
"summary" : {
"operation" : "overwrite",
"flink.job-id" : "94aed63193990d73442f8696c3eee136",
"flink.max-committed-checkpoint-id" : "4",
"added-data-files" : "1",
"added-delete-files" : "1",
"added-records" : "95137",
"added-files-size" : "783498",
"added-equality-deletes" : "95137",
"changed-partition-count" : "1",
"total-records" : "1190331",
"total-files-size" : "4648193",
"total-data-files" : "4",
"total-delete-files" : "3",
"total-position-deletes" : "0",
"total-equality-deletes" : "190331"
},
"manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-3921229567852426700-1-bf413511-d1cf-407f-bcc9-b6960cde7898.avro"
}, {
"sequence-number" : 5,
"snapshot-id" : 7125662397327732785,
"parent-snapshot-id" : 3921229567852426700,
"timestamp-ms" : 1619090082142,
"summary" : {
"operation" : "overwrite",
"flink.job-id" : "94aed63193990d73442f8696c3eee136",
"flink.max-committed-checkpoint-id" : "5",
"added-data-files" : "1",
"added-delete-files" : "1",
"added-records" : "2772",
"added-files-size" : "25696",
"added-equality-deletes" : "2772",
"changed-partition-count" : "1",
"total-records" : "1193103",
"total-files-size" : "4673889",
"total-data-files" : "5",
"total-delete-files" : "4",
"total-position-deletes" : "0",
"total-equality-deletes" : "193103"
},
"manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7125662397327732785-1-2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07.avro"
}, {
"sequence-number" : 6,
"snapshot-id" : 7329471080018208648,
"parent-snapshot-id" : 7125662397327732785,
"timestamp-ms" : 1619090084800,
"summary" : {
"operation" : "replace",
"added-data-files" : "1",
"deleted-data-files" : "4",
"removed-delete-files" : "3",
"added-records" : "1000000",
"deleted-records" : "1190331",
"added-files-size" : "3293597",
"removed-files-size" : "4648193",
"removed-equality-deletes" : "190331",
"changed-partition-count" : "1",
"total-records" : "1002772",
"total-files-size" : "3319293",
"total-data-files" : "2",
"total-delete-files" : "1",
"total-position-deletes" : "0",
"total-equality-deletes" : "2772"
},
"manifest-list" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/snap-7329471080018208648-1-f0bd795c-6a10-41bc-8f79-437fef1ff5f9.avro"
} ],
"snapshot-log" : [ {
"timestamp-ms" : 1619089843403,
"snapshot-id" : 5386042144404510937
}, {
"timestamp-ms" : 1619089902186,
"snapshot-id" : 1289984099921389549
}, {
"timestamp-ms" : 1619089962201,
"snapshot-id" : 7377732782289998100
}, {
"timestamp-ms" : 1619090021768,
"snapshot-id" : 3921229567852426700
}, {
"timestamp-ms" : 1619090082142,
"snapshot-id" : 7125662397327732785
}, {
"timestamp-ms" : 1619090084800,
"snapshot-id" : 7329471080018208648
} ],
"metadata-log" : [ {
"timestamp-ms" : 1619089691387,
"metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00000-f9a42593-ab76-4933-a739-8e10b476fc85.metadata.json"
}, {
"timestamp-ms" : 1619089741748,
"metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00001-2002be31-0182-4085-9173-aee3e4facc0b.metadata.json"
}, {
"timestamp-ms" : 1619089843403,
"metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00002-2c5e9702-a908-43a6-bbe8-0f0c6582e984.metadata.json"
}, {
"timestamp-ms" : 1619089902186,
"metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00003-3db39d6b-6311-4bdb-9d7b-b56f2df74fb3.metadata.json"
}, {
"timestamp-ms" : 1619089962201,
"metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00004-a5490f98-4daf-4592-abf1-fdcc408f1b0f.metadata.json"
}, {
"timestamp-ms" : 1619090021768,
"metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00005-b13e2c1f-1383-43c3-a53c-832ed8c68fa8.metadata.json"
}, {
"timestamp-ms" : 1619090082142,
"metadata-file" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/00006-68ce5b89-27fb-421a-8a49-42f383dfc587.metadata.json"
} ]
}
其中包含了所有的snapshot信息和所有的元数据文件信息
注意sequence-number和snapshot-id,它们是强关联的,
sequence-number在v2版本的表中会作为标识数据的序列号
读取的时候,用equality-delete数据过滤data文件是按sequence-number判断的:
只会找sequence-number比该data文件大的equality-delete文件来过滤
小文件合并也和入库时的checkpoint一样,会生成新的snapshot。
假设入库已经生成了snapshot 3,这时开始小文件合并,合并过程中入库又生成了snapshot 4,
然后合并完成,生成snapshot 5。
snapshot 5的文件只合并了snapshot 3的文件,本来需要用snapshot 4中的equality-delete文件对它进行过滤,但是因为5比4大,就不会过滤了。
也就是说,小文件合并一旦跨了入库的snapshot,数据就有问题了。
当前的snapshotID和对应的文件,查看该文件snap-7329471080018208648-1-f0bd795c-6a10-41bc-8f79-437fef1ff5f9.avro
{
"manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m7.avro",
"manifest_length" : 6569,
"partition_spec_id" : 0,
"content" : 0,
"sequence_number" : 6,
"min_sequence_number" : 6,
"added_snapshot_id" : 7329471080018208648,
"added_data_files_count" : 1,
"existing_data_files_count" : 0,
"deleted_data_files_count" : 0,
"added_rows_count" : 1000000,
"existing_rows_count" : 0,
"deleted_rows_count" : 0,
"partitions" : {
"array" : [ ]
}
00000-0-79d89118-5069-4877-8332-2a592c887fe3-00001.parquet "status" : 1 "content" : 0
}
{
"manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07-m0.avro",
"manifest_length" : 6557,
"partition_spec_id" : 0,
"content" : 0,
"sequence_number" : 5,
"min_sequence_number" : 5,
"added_snapshot_id" : 7125662397327732785,
"added_data_files_count" : 1,
"existing_data_files_count" : 0,
"deleted_data_files_count" : 0,
"added_rows_count" : 2772,
"existing_rows_count" : 0,
"deleted_rows_count" : 0,
"partitions" : {
"array" : [ ]
}
00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00009.parquet "status" : 1 "content" : 0
}
{
"manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m0.avro",
"manifest_length" : 6553,
"partition_spec_id" : 0,
"content" : 0,
"sequence_number" : 6,
"min_sequence_number" : 6,
"added_snapshot_id" : 7329471080018208648,
"added_data_files_count" : 0,
"existing_data_files_count" : 0,
"deleted_data_files_count" : 1,
"added_rows_count" : 0,
"existing_rows_count" : 0,
"deleted_rows_count" : 95137,
"partitions" : {
"array" : [ ]
}
00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00007.parquet "status" : 2 "content" : 0
}
{
"manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m3.avro",
"manifest_length" : 6554,
"partition_spec_id" : 0,
"content" : 0,
"sequence_number" : 6,
"min_sequence_number" : 6,
"added_snapshot_id" : 7329471080018208648,
"added_data_files_count" : 0,
"existing_data_files_count" : 0,
"deleted_data_files_count" : 1,
"added_rows_count" : 0,
"existing_rows_count" : 0,
"deleted_rows_count" : 73302,
"partitions" : {
"array" : [ ]
}
00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00005.parquet "status" : 2 "content" : 0
}
{
"manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m2.avro",
"manifest_length" : 6553,
"partition_spec_id" : 0,
"content" : 0,
"sequence_number" : 6,
"min_sequence_number" : 6,
"added_snapshot_id" : 7329471080018208648,
"added_data_files_count" : 0,
"existing_data_files_count" : 0,
"deleted_data_files_count" : 1,
"added_rows_count" : 0,
"existing_rows_count" : 0,
"deleted_rows_count" : 21892,
"partitions" : {
"array" : [ ]
}
00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00003.parquet "status" : 2 "content" : 0
}
{
"manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m1.avro",
"manifest_length" : 6566,
"partition_spec_id" : 0,
"content" : 0,
"sequence_number" : 6,
"min_sequence_number" : 6,
"added_snapshot_id" : 7329471080018208648,
"added_data_files_count" : 0,
"existing_data_files_count" : 0,
"deleted_data_files_count" : 1,
"added_rows_count" : 0,
"existing_rows_count" : 0,
"deleted_rows_count" : 1000000,
"partitions" : {
"array" : [ ]
}
00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00001.parquet "status" : 2 "content" : 0
}
{
"manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/2b1ddf19-5701-4c0b-ac6a-ea41fdab9c07-m1.avro",
"manifest_length" : 6568,
"partition_spec_id" : 0,
"content" : 1,
"sequence_number" : 5,
"min_sequence_number" : 5,
"added_snapshot_id" : 7125662397327732785,
"added_data_files_count" : 1,
"existing_data_files_count" : 0,
"deleted_data_files_count" : 0,
"added_rows_count" : 2772,
"existing_rows_count" : 0,
"deleted_rows_count" : 0,
"partitions" : {
"array" : [ ]
}
00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00010.parquet "status" : 1 "content" : 2
}
{
"manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m4.avro",
"manifest_length" : 6568,
"partition_spec_id" : 0,
"content" : 1,
"sequence_number" : 6,
"min_sequence_number" : 6,
"added_snapshot_id" : 7329471080018208648,
"added_data_files_count" : 0,
"existing_data_files_count" : 0,
"deleted_data_files_count" : 1,
"added_rows_count" : 0,
"existing_rows_count" : 0,
"deleted_rows_count" : 95137,
"partitions" : {
"array" : [ ]
}
00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00008.parquet "status" : 2 "content" : 2
}
{
"manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m5.avro",
"manifest_length" : 6570,
"partition_spec_id" : 0,
"content" : 1,
"sequence_number" : 6,
"min_sequence_number" : 6,
"added_snapshot_id" : 7329471080018208648,
"added_data_files_count" : 0,
"existing_data_files_count" : 0,
"deleted_data_files_count" : 1,
"added_rows_count" : 0,
"existing_rows_count" : 0,
"deleted_rows_count" : 73302,
"partitions" : {
"array" : [ ]
}
00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00006.parquet "status" : 2 "content" : 2
}
{
"manifest_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/metadata/f0bd795c-6a10-41bc-8f79-437fef1ff5f9-m6.avro",
"manifest_length" : 6567,
"partition_spec_id" : 0,
"content" : 1,
"sequence_number" : 6,
"min_sequence_number" : 6,
"added_snapshot_id" : 7329471080018208648,
"added_data_files_count" : 0,
"existing_data_files_count" : 0,
"deleted_data_files_count" : 1,
"added_rows_count" : 0,
"existing_rows_count" : 0,
"deleted_rows_count" : 21892,
"partitions" : {
"array" : [ ]
}
00000-0-319a206d-7ead-415d-9ec8-700c1a49b8c4-00004.parquet "status" : 2 "content" : 2
}
这其中包含了所有的manifest文件,注意content属性,在ManifestContent 中定义了其意义:0表示记录data文件的Manifest(DATA),1表示记录delete文件的Manifest(DELETES)
/**
 * Kind of content a manifest file holds: {@code DATA} or {@code DELETES}.
 */
public enum ManifestContent {
  DATA(0),
  DELETES(1);

  // Integer code bound to each constant at construction time.
  private final int code;

  ManifestContent(int code) {
    this.code = code;
  }

  /**
   * @return the integer id of this content type (0 for DATA, 1 for DELETES)
   */
  public int id() {
    return this.code;
  }
}
查看manifest文件
{
"status" : 1,
"snapshot_id" : {
"long" : 7329471080018208648
},
"sequence_number" : null,
"data_file" : {
"content" : 0,
"file_path" : "hdfs://test-hdfs1/user/hive/dc-warehouse/iceberg_cdc_table/data/00000-0-79d89118-5069-4877-8332-2a592c887fe3-00001.parquet",
"file_format" : "PARQUET",
"partition" : { },
"record_count" : 1000000,
"file_size_in_bytes" : 3293597,
"column_sizes" : {
"array" : [ {
"key" : 1,
"value" : 2554588
}, {
"key" : 2,
"value" : 734455
} ]
},
"value_counts" : {
"array" : [ {
"key" : 1,
"value" : 1000000
}, {
"key" : 2,
"value" : 1000000
} ]
},
"null_value_counts" : {
"array" : [ {
"key" : 1,
"value" : 0
}, {
"key" : 2,
"value" : 0
} ]
},
"nan_value_counts" : {
"array" : [ ]
},
"lower_bounds" : {
"array" : [ {
"key" : 1,
"value" : "0"
}, {
"key" : 2,
"value" : "007-dacf7d6ae3f9"
} ]
},
"upper_bounds" : {
"array" : [ {
"key" : 1,
"value" : "999999"
}, {
"key" : 2,
"value" : "ff3-e85ff5b95460"
} ]
},
"key_metadata" : null,
"split_offsets" : {
"array" : [ 4 ]
},
"equality_ids" : null,
"sort_order_id" : {
"int" : 0
}
}
}
注意status属性,在ManifestEntry接口中定义了枚举
package org.apache.iceberg;
interface ManifestEntry<F extends ContentFile<F>> {
  /**
   * Status of a manifest entry; every constant carries a fixed integer id.
   */
  enum Status {
    EXISTING(0),
    ADDED(1),
    DELETED(2);

    // Integer code bound to each constant at construction time.
    private final int code;

    Status(int code) {
      this.code = code;
    }

    /**
     * @return the integer id of this status (0 EXISTING, 1 ADDED, 2 DELETED)
     */
    public int id() {
      return this.code;
    }
  }
}
1表示添加的文件,2表示已经无效需要删除的文件;0(EXISTING)在上面的例子中没有出现
还有content属性,在FileContent 类中定义了其意义,0表示数据文件,1表示POSITION_DELETES文件,2表示 EQUALITY_DELETES文件
package org.apache.iceberg;
/**
 * Kind of content a single file holds: {@code DATA}, {@code POSITION_DELETES},
 * or {@code EQUALITY_DELETES}.
 */
public enum FileContent {
  DATA(0),
  POSITION_DELETES(1),
  EQUALITY_DELETES(2);

  // Integer code bound to each constant at construction time.
  private final int code;

  FileContent(int code) {
    this.code = code;
  }

  /**
   * @return the integer id of this content type
   *         (0 DATA, 1 POSITION_DELETES, 2 EQUALITY_DELETES)
   */
  public int id() {
    return this.code;
  }
}
上面的snapshot文件snap-7329471080018208648-1-f0bd795c-6a10-41bc-8f79-437fef1ff5f9.avro是最新的snapshot文件,其中有6个content为0的manifest和4个content为1的manifest,因为我这里是初始入了100w条cdc数据生成一个data文件,然后经历了4次update,生成了4个data文件和4个delete文件,最后做了一次文件合并,生成一个新的data文件。
我提取了其中对应的parquet文件和其status和content信息,status为1的有3个,即只有3个有效的文件:一个是进行小文件合并后生成的文件,另外两个是之后入库的更新文件(sequence_number为5,没有参与合并),这两个文件一个是DATA文件,一个是EQUALITY_DELETES文件(content为2)。
而在小文件合并之前则是9个有效文件:5个data文件和4个EQUALITY_DELETES文件。