-
Notifications
You must be signed in to change notification settings - Fork 18
Antalya 26.1 : Fix Alter table operations(add, modify, drop, rename) support for Iceberg #1594
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: antalya-26.1
Are you sure you want to change the base?
Changes from all commits
5c0d0dd
6664736
681dba1
9f1fd8d
156b93f
48f96f9
d658734
f344d72
8882211
4ae5409
4f37d22
c29588a
c7e0667
822cadb
e2ae734
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,7 @@ | |
| #include <IO/Operators.h> | ||
| #include <Interpreters/Context.h> | ||
|
|
||
| #include <Storages/ObjectStorage/DataLakes/Iceberg/Constant.h> | ||
| #include <Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h> | ||
| #include <Server/HTTP/HTMLForm.h> | ||
| #include <Formats/FormatFactory.h> | ||
|
|
@@ -42,6 +43,8 @@ | |
| #include <Poco/Net/SSLManager.h> | ||
| #include <Poco/StreamCopier.h> | ||
|
|
||
| #include <sstream> | ||
|
|
||
|
|
||
| namespace DB::ErrorCodes | ||
| { | ||
|
|
@@ -116,6 +119,141 @@ String encodeNamespaceForURI(const String & namespace_name) | |
|
|
||
| } | ||
|
|
||
| namespace | ||
| { | ||
| Poco::JSON::Object::Ptr cloneJsonObject(const Poco::JSON::Object::Ptr & obj) | ||
| { | ||
| std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM | ||
| obj->stringify(oss); | ||
|
|
||
| Poco::JSON::Parser parser; | ||
| return parser.parse(oss.str()).extract<Poco::JSON::Object::Ptr>(); | ||
| } | ||
| } | ||
|
|
||
| Poco::JSON::Object::Ptr buildUpdateMetadataRequestBody( | ||
| const String & namespace_name, const String & table_name, Poco::JSON::Object::Ptr new_snapshot) | ||
| { | ||
| if (!new_snapshot) | ||
| return nullptr; | ||
|
|
||
| Poco::JSON::Object::Ptr request_body = new Poco::JSON::Object; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wonder how they (poco lib) manage the life cycle of such objects
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From poco docs Poco::JSON::Object::Ptr is a type alias for a Poco::SharedPtr managing a Poco::JSON::Object . |
||
| { | ||
| Poco::JSON::Object::Ptr identifier = new Poco::JSON::Object; | ||
| identifier->set("name", table_name); | ||
| Poco::JSON::Array::Ptr namespaces = new Poco::JSON::Array; | ||
| namespaces->add(namespace_name); | ||
| identifier->set("namespace", namespaces); | ||
|
|
||
| request_body->set("identifier", identifier); | ||
| } | ||
|
|
||
| // Schema-change commit path (ALTER TABLE add/drop/modify/rename column). | ||
| if (new_snapshot->has(DB::Iceberg::f_schemas)) | ||
| { | ||
| if (!new_snapshot->has(DB::Iceberg::f_current_schema_id)) | ||
| throw DB::Exception( | ||
| DB::ErrorCodes::DATALAKE_DATABASE_ERROR, | ||
| "Iceberg update-metadata for {}.{} is missing '{}' field", | ||
| namespace_name, table_name, DB::Iceberg::f_current_schema_id); | ||
|
|
||
| const Int32 new_schema_id = new_snapshot->getValue<Int32>(DB::Iceberg::f_current_schema_id); | ||
| // old_schema_id = new_schema_id - 1 is the ClickHouse-writer convention; old_schema_id >= 0 | ||
| // means "there is a previous schema, emit assert-current-schema-id as precondition". | ||
| const Int32 old_schema_id = new_schema_id - 1; | ||
|
|
||
| Poco::JSON::Object::Ptr new_schema_obj; | ||
| auto schemas = new_snapshot->getArray(DB::Iceberg::f_schemas); | ||
| for (UInt32 i = 0; i < schemas->size(); ++i) | ||
| { | ||
| auto s = schemas->getObject(i); | ||
| if (s->getValue<Int32>(DB::Iceberg::f_schema_id) == new_schema_id) | ||
| { | ||
| new_schema_obj = s; | ||
| break; | ||
| } | ||
| } | ||
| if (!new_schema_obj) | ||
| throw DB::Exception( | ||
| DB::ErrorCodes::DATALAKE_DATABASE_ERROR, | ||
| "Iceberg update-metadata for {}.{}: no schema object matching current-schema-id={}", | ||
| namespace_name, table_name, new_schema_id); | ||
|
|
||
| Poco::JSON::Object::Ptr schema_for_rest = cloneJsonObject(new_schema_obj); | ||
| if (!schema_for_rest->has("identifier-field-ids")) | ||
| { | ||
| Poco::JSON::Array::Ptr empty_identifier_field_ids = new Poco::JSON::Array; | ||
| schema_for_rest->set("identifier-field-ids", empty_identifier_field_ids); | ||
| } | ||
|
|
||
| if (old_schema_id >= 0) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I assume this means: is this not the first schema?
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I honestly dont know he specs, so I cant really give an opinion on the below fields
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes correct, the logic is |
||
| { | ||
| Poco::JSON::Object::Ptr requirement = new Poco::JSON::Object; | ||
| requirement->set("type", "assert-current-schema-id"); | ||
| requirement->set("current-schema-id", old_schema_id); | ||
|
|
||
| Poco::JSON::Array::Ptr requirements = new Poco::JSON::Array; | ||
| requirements->add(requirement); | ||
| request_body->set("requirements", requirements); | ||
| } | ||
|
|
||
| Poco::JSON::Array::Ptr updates = new Poco::JSON::Array; | ||
| { | ||
| Poco::JSON::Object::Ptr add_schema = new Poco::JSON::Object; | ||
| add_schema->set("action", "add-schema"); | ||
| add_schema->set("schema", schema_for_rest); | ||
| if (new_snapshot->has(DB::Iceberg::f_last_column_id)) | ||
| add_schema->set("last-column-id", new_snapshot->getValue<Int32>(DB::Iceberg::f_last_column_id)); | ||
| updates->add(add_schema); | ||
| } | ||
| { | ||
| Poco::JSON::Object::Ptr set_current_schema = new Poco::JSON::Object; | ||
| set_current_schema->set("action", "set-current-schema"); | ||
| set_current_schema->set("schema-id", new_schema_id); | ||
| updates->add(set_current_schema); | ||
| } | ||
| request_body->set("updates", updates); | ||
| } | ||
| else | ||
| { | ||
| // Snapshot-append commit path (INSERT / position-delete mutation). | ||
| if (new_snapshot->has("parent-snapshot-id")) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Again, I dont know the specs, but to my understand it is impossible that
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nvm, I confused the if blocks |
||
| { | ||
| auto parent_snapshot_id = new_snapshot->getValue<Int64>("parent-snapshot-id"); | ||
| if (parent_snapshot_id != -1) | ||
| { | ||
| Poco::JSON::Object::Ptr requirement = new Poco::JSON::Object; | ||
| requirement->set("type", "assert-ref-snapshot-id"); | ||
| requirement->set("ref", "main"); | ||
| requirement->set("snapshot-id", parent_snapshot_id); | ||
|
|
||
| Poco::JSON::Array::Ptr requirements = new Poco::JSON::Array; | ||
| requirements->add(requirement); | ||
| request_body->set("requirements", requirements); | ||
| } | ||
| } | ||
|
|
||
| Poco::JSON::Array::Ptr updates = new Poco::JSON::Array; | ||
| { | ||
| Poco::JSON::Object::Ptr add_snapshot = new Poco::JSON::Object; | ||
| add_snapshot->set("action", "add-snapshot"); | ||
| add_snapshot->set("snapshot", new_snapshot); | ||
| updates->add(add_snapshot); | ||
| } | ||
| { | ||
| Poco::JSON::Object::Ptr set_snapshot = new Poco::JSON::Object; | ||
| set_snapshot->set("action", "set-snapshot-ref"); | ||
| set_snapshot->set("ref-name", "main"); | ||
| set_snapshot->set("type", "branch"); | ||
| set_snapshot->set("snapshot-id", new_snapshot->getValue<Int64>("snapshot-id")); | ||
| updates->add(set_snapshot); | ||
| } | ||
| request_body->set("updates", updates); | ||
| } | ||
|
|
||
| return request_body; | ||
| } | ||
|
|
||
| std::string RestCatalog::Config::toString() const | ||
| { | ||
| DB::WriteBufferFromOwnString wb; | ||
|
|
@@ -1294,62 +1432,19 @@ bool RestCatalog::updateMetadata(const String & namespace_name, const String & t | |
| { | ||
| const std::string endpoint = fmt::format("{}/namespaces/{}/tables/{}", base_url, namespace_name, table_name); | ||
|
|
||
| Poco::JSON::Object::Ptr request_body = new Poco::JSON::Object; | ||
| { | ||
| Poco::JSON::Object::Ptr identifier = new Poco::JSON::Object; | ||
| identifier->set("name", table_name); | ||
| Poco::JSON::Array::Ptr namespaces = new Poco::JSON::Array; | ||
| namespaces->add(namespace_name); | ||
| identifier->set("namespace", namespaces); | ||
|
|
||
| request_body->set("identifier", identifier); | ||
| } | ||
|
|
||
| if (new_snapshot->has("parent-snapshot-id")) | ||
| { | ||
| auto parent_snapshot_id = new_snapshot->getValue<Int64>("parent-snapshot-id"); | ||
| if (parent_snapshot_id != -1) | ||
| { | ||
| Poco::JSON::Object::Ptr requirement = new Poco::JSON::Object; | ||
| requirement->set("type", "assert-ref-snapshot-id"); | ||
| requirement->set("ref", "main"); | ||
| requirement->set("snapshot-id", parent_snapshot_id); | ||
|
|
||
| Poco::JSON::Array::Ptr requirements = new Poco::JSON::Array; | ||
| requirements->add(requirement); | ||
|
|
||
| request_body->set("requirements", requirements); | ||
| } | ||
| } | ||
|
|
||
| { | ||
| Poco::JSON::Array::Ptr updates = new Poco::JSON::Array; | ||
|
|
||
| { | ||
| Poco::JSON::Object::Ptr add_snapshot = new Poco::JSON::Object; | ||
| add_snapshot->set("action", "add-snapshot"); | ||
| add_snapshot->set("snapshot", new_snapshot); | ||
| updates->add(add_snapshot); | ||
| } | ||
|
|
||
| { | ||
| Poco::JSON::Object::Ptr set_snapshot = new Poco::JSON::Object; | ||
| set_snapshot->set("action", "set-snapshot-ref"); | ||
| set_snapshot->set("ref-name", "main"); | ||
| set_snapshot->set("type", "branch"); | ||
| set_snapshot->set("snapshot-id", new_snapshot->getValue<Int64>("snapshot-id")); | ||
|
|
||
| updates->add(set_snapshot); | ||
| } | ||
| request_body->set("updates", updates); | ||
| } | ||
| // Throws DB::Exception(DATALAKE_DATABASE_ERROR) on malformed metadata (programming error). | ||
| auto request_body = buildUpdateMetadataRequestBody(namespace_name, table_name, new_snapshot); | ||
| if (!request_body) | ||
| return true; // nothing to commit | ||
|
|
||
| try | ||
| { | ||
| sendRequest(endpoint, request_body); | ||
| } | ||
| catch (const DB::HTTPException &) | ||
| catch (const DB::HTTPException & ex) | ||
| { | ||
| LOG_WARNING(log, "Iceberg REST updateMetadata for {}.{} failed: {}", | ||
| namespace_name, table_name, ex.displayText()); | ||
| return false; | ||
| } | ||
| return true; | ||
|
|
||

There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
wow, is this really the best way to clone it? I havent checked th rest of the code yet, but I am curious to understand why this is needed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ok, I understand why you need it. I also see poco does not offer a deep copy APi, so this is ok