Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion docs/en/sql-reference/table-functions/iceberg.md
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ y: 993

### Schema evolution {#iceberg-writes-schema-evolution}

ClickHouse allows you to add, drop, or modify columns with simple types (non-tuple, non-array, non-map).
ClickHouse allows you to add, drop, modify, or rename columns with simple types (non-tuple, non-array, non-map).

### Example {#example-iceberg-writes-evolution}

Expand Down Expand Up @@ -479,6 +479,27 @@ Row 1:
──────
x: Ivanov
y: 993

ALTER TABLE iceberg_writes_example RENAME COLUMN y TO value;
SHOW CREATE TABLE iceberg_writes_example;

┌─statement─────────────────────────────────────────────────┐
1. │ CREATE TABLE default.iceberg_writes_example ↴│
│↳( ↴│
│↳ `x` Nullable(String), ↴│
│↳ `value` Nullable(Int64) ↴│
│↳) ↴│
│↳ENGINE = IcebergLocal('/home/scanhex12/iceberg_example/') │
└───────────────────────────────────────────────────────────┘

SELECT *
FROM iceberg_writes_example
FORMAT VERTICAL;

Row 1:
──────
x: Ivanov
value: 993
```

### Compaction {#iceberg-writes-compaction}
Expand Down
195 changes: 145 additions & 50 deletions src/Databases/DataLake/RestCatalog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
#include <IO/Operators.h>
#include <Interpreters/Context.h>

#include <Storages/ObjectStorage/DataLakes/Iceberg/Constant.h>
#include <Storages/ObjectStorage/DataLakes/Iceberg/IcebergMetadata.h>
#include <Server/HTTP/HTMLForm.h>
#include <Formats/FormatFactory.h>
Expand All @@ -42,6 +43,8 @@
#include <Poco/Net/SSLManager.h>
#include <Poco/StreamCopier.h>

#include <sstream>


namespace DB::ErrorCodes
{
Expand Down Expand Up @@ -116,6 +119,141 @@ String encodeNamespaceForURI(const String & namespace_name)

}

namespace
{
Poco::JSON::Object::Ptr cloneJsonObject(const Poco::JSON::Object::Ptr & obj)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wow, is this really the best way to clone it? I haven't checked the rest of the code yet, but I am curious to understand why this is needed.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, I understand why you need it. I also see that Poco does not offer a deep-copy API, so this is OK.

{
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
obj->stringify(oss);

Poco::JSON::Parser parser;
return parser.parse(oss.str()).extract<Poco::JSON::Object::Ptr>();
}
}

Poco::JSON::Object::Ptr buildUpdateMetadataRequestBody(
const String & namespace_name, const String & table_name, Poco::JSON::Object::Ptr new_snapshot)
{
if (!new_snapshot)
return nullptr;

Poco::JSON::Object::Ptr request_body = new Poco::JSON::Object;
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wonder how they (the Poco library) manage the lifecycle of such objects.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the Poco docs: Poco::JSON::Object::Ptr is a type alias for a Poco::SharedPtr managing a Poco::JSON::Object.
It's a reference-counting smart pointer.

{
Poco::JSON::Object::Ptr identifier = new Poco::JSON::Object;
identifier->set("name", table_name);
Poco::JSON::Array::Ptr namespaces = new Poco::JSON::Array;
namespaces->add(namespace_name);
identifier->set("namespace", namespaces);

request_body->set("identifier", identifier);
}

// Schema-change commit path (ALTER TABLE add/drop/modify/rename column).
if (new_snapshot->has(DB::Iceberg::f_schemas))
{
if (!new_snapshot->has(DB::Iceberg::f_current_schema_id))
throw DB::Exception(
DB::ErrorCodes::DATALAKE_DATABASE_ERROR,
"Iceberg update-metadata for {}.{} is missing '{}' field",
namespace_name, table_name, DB::Iceberg::f_current_schema_id);

const Int32 new_schema_id = new_snapshot->getValue<Int32>(DB::Iceberg::f_current_schema_id);
// old_schema_id = new_schema_id - 1 is the ClickHouse-writer convention; old_schema_id >= 0
// means "there is a previous schema, emit assert-current-schema-id as precondition".
const Int32 old_schema_id = new_schema_id - 1;

Poco::JSON::Object::Ptr new_schema_obj;
auto schemas = new_snapshot->getArray(DB::Iceberg::f_schemas);
for (UInt32 i = 0; i < schemas->size(); ++i)
{
auto s = schemas->getObject(i);
if (s->getValue<Int32>(DB::Iceberg::f_schema_id) == new_schema_id)
{
new_schema_obj = s;
break;
}
}
if (!new_schema_obj)
throw DB::Exception(
DB::ErrorCodes::DATALAKE_DATABASE_ERROR,
"Iceberg update-metadata for {}.{}: no schema object matching current-schema-id={}",
namespace_name, table_name, new_schema_id);

Poco::JSON::Object::Ptr schema_for_rest = cloneJsonObject(new_schema_obj);
if (!schema_for_rest->has("identifier-field-ids"))
{
Poco::JSON::Array::Ptr empty_identifier_field_ids = new Poco::JSON::Array;
schema_for_rest->set("identifier-field-ids", empty_identifier_field_ids);
}

if (old_schema_id >= 0)
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume this means: is this not the first schema?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I honestly don't know the specs, so I can't really give an opinion on the fields below.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, correct. The logic is `const Int32 old_schema_id = new_schema_id - 1;` — if a current schema exists (old_schema_id >= 0), then its id is added to the request as the assert-current-schema-id requirement.

{
  "identifier": {
    "namespace": ["sales", "events"],
    "name": "orders"
  },
  "requirements": [
    { "type": "assert-table-uuid", "uuid": "<table-uuid-from-loadTable>" },
    { "type": "assert-current-schema-id", "current-schema-id": 3 }
  ],
  "updates": [
    {
      "action": "rename-column",
      "name": "old_name",
      "new-name": "new_name"
    }
  ]
}

https://github.com/Altinity/ClickHouse/pull/1594/changes/BASE..4f37d22fc562d75bdad4f41d414125a966520964#diff-a70dee27f63a76254ba9d7e7f9dc632d5cdee3d19be91663d04686247582e6faR167

{
Poco::JSON::Object::Ptr requirement = new Poco::JSON::Object;
requirement->set("type", "assert-current-schema-id");
requirement->set("current-schema-id", old_schema_id);

Poco::JSON::Array::Ptr requirements = new Poco::JSON::Array;
requirements->add(requirement);
request_body->set("requirements", requirements);
}

Poco::JSON::Array::Ptr updates = new Poco::JSON::Array;
{
Poco::JSON::Object::Ptr add_schema = new Poco::JSON::Object;
add_schema->set("action", "add-schema");
add_schema->set("schema", schema_for_rest);
if (new_snapshot->has(DB::Iceberg::f_last_column_id))
add_schema->set("last-column-id", new_snapshot->getValue<Int32>(DB::Iceberg::f_last_column_id));
updates->add(add_schema);
}
{
Poco::JSON::Object::Ptr set_current_schema = new Poco::JSON::Object;
set_current_schema->set("action", "set-current-schema");
set_current_schema->set("schema-id", new_schema_id);
updates->add(set_current_schema);
}
request_body->set("updates", updates);
}
else
{
// Snapshot-append commit path (INSERT / position-delete mutation).
if (new_snapshot->has("parent-snapshot-id"))
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again, I don't know the specs, but to my understanding it is impossible that old_schema_id < 0 while parent_snapshot_id != -1. Please educate me.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Image The else part https://github.com/Altinity/ClickHouse/pull/1594/changes/BASE..4f37d22fc562d75bdad4f41d414125a966520964#diff-a70dee27f63a76254ba9d7e7f9dc632d5cdee3d19be91663d04686247582e6faL1308 is pre-existing code and is not related to updating schemas; it is used for insert/append snapshot commits. I just moved the code into a separate function, since both paths build the REST API update request.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nvm, I confused the if blocks

{
auto parent_snapshot_id = new_snapshot->getValue<Int64>("parent-snapshot-id");
if (parent_snapshot_id != -1)
{
Poco::JSON::Object::Ptr requirement = new Poco::JSON::Object;
requirement->set("type", "assert-ref-snapshot-id");
requirement->set("ref", "main");
requirement->set("snapshot-id", parent_snapshot_id);

Poco::JSON::Array::Ptr requirements = new Poco::JSON::Array;
requirements->add(requirement);
request_body->set("requirements", requirements);
}
}

Poco::JSON::Array::Ptr updates = new Poco::JSON::Array;
{
Poco::JSON::Object::Ptr add_snapshot = new Poco::JSON::Object;
add_snapshot->set("action", "add-snapshot");
add_snapshot->set("snapshot", new_snapshot);
updates->add(add_snapshot);
}
{
Poco::JSON::Object::Ptr set_snapshot = new Poco::JSON::Object;
set_snapshot->set("action", "set-snapshot-ref");
set_snapshot->set("ref-name", "main");
set_snapshot->set("type", "branch");
set_snapshot->set("snapshot-id", new_snapshot->getValue<Int64>("snapshot-id"));
updates->add(set_snapshot);
}
request_body->set("updates", updates);
}

return request_body;
}

std::string RestCatalog::Config::toString() const
{
DB::WriteBufferFromOwnString wb;
Expand Down Expand Up @@ -1294,62 +1432,19 @@ bool RestCatalog::updateMetadata(const String & namespace_name, const String & t
{
const std::string endpoint = fmt::format("{}/namespaces/{}/tables/{}", base_url, namespace_name, table_name);

Poco::JSON::Object::Ptr request_body = new Poco::JSON::Object;
{
Poco::JSON::Object::Ptr identifier = new Poco::JSON::Object;
identifier->set("name", table_name);
Poco::JSON::Array::Ptr namespaces = new Poco::JSON::Array;
namespaces->add(namespace_name);
identifier->set("namespace", namespaces);

request_body->set("identifier", identifier);
}

if (new_snapshot->has("parent-snapshot-id"))
{
auto parent_snapshot_id = new_snapshot->getValue<Int64>("parent-snapshot-id");
if (parent_snapshot_id != -1)
{
Poco::JSON::Object::Ptr requirement = new Poco::JSON::Object;
requirement->set("type", "assert-ref-snapshot-id");
requirement->set("ref", "main");
requirement->set("snapshot-id", parent_snapshot_id);

Poco::JSON::Array::Ptr requirements = new Poco::JSON::Array;
requirements->add(requirement);

request_body->set("requirements", requirements);
}
}

{
Poco::JSON::Array::Ptr updates = new Poco::JSON::Array;

{
Poco::JSON::Object::Ptr add_snapshot = new Poco::JSON::Object;
add_snapshot->set("action", "add-snapshot");
add_snapshot->set("snapshot", new_snapshot);
updates->add(add_snapshot);
}

{
Poco::JSON::Object::Ptr set_snapshot = new Poco::JSON::Object;
set_snapshot->set("action", "set-snapshot-ref");
set_snapshot->set("ref-name", "main");
set_snapshot->set("type", "branch");
set_snapshot->set("snapshot-id", new_snapshot->getValue<Int64>("snapshot-id"));

updates->add(set_snapshot);
}
request_body->set("updates", updates);
}
// Throws DB::Exception(DATALAKE_DATABASE_ERROR) on malformed metadata (programming error).
auto request_body = buildUpdateMetadataRequestBody(namespace_name, table_name, new_snapshot);
if (!request_body)
return true; // nothing to commit

try
{
sendRequest(endpoint, request_body);
}
catch (const DB::HTTPException &)
catch (const DB::HTTPException & ex)
{
LOG_WARNING(log, "Iceberg REST updateMetadata for {}.{} failed: {}",
namespace_name, table_name, ex.displayText());
return false;
}
return true;
Expand Down
10 changes: 10 additions & 0 deletions src/Databases/DataLake/RestCatalog.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,16 @@ struct AccessToken
}
};

/// Builds the JSON body for `POST .../namespaces/{ns}/tables/{table}` (Iceberg REST update).
///
/// The body always contains an `identifier` object holding `name` (= `table_name`) and a
/// one-element `namespace` array (= `namespace_name`). The rest depends on `new_snapshot`:
///
///  - Schema-change commit: taken when `new_snapshot` has the `DB::Iceberg::f_schemas` field.
///    `updates` holds an `add-schema` action (carrying a copy of the schema whose id equals the
///    current schema id, with an empty `identifier-field-ids` array added if that key is absent)
///    followed by a `set-current-schema` action. When the current schema id minus one is
///    non-negative, a single `assert-current-schema-id` requirement with that value is emitted.
///
///  - Snapshot-append commit: taken otherwise. `updates` holds an `add-snapshot` action
///    (carrying `new_snapshot` itself) followed by a `set-snapshot-ref` action for ref `main`.
///    When `parent-snapshot-id` is present and not -1, a single `assert-ref-snapshot-id`
///    requirement for ref `main` is emitted.
///
/// Returns `nullptr` when `new_snapshot` is null (nothing to commit). Throws
/// `DB::Exception(DATALAKE_DATABASE_ERROR)` with a specific message when the metadata
/// blob is malformed (e.g. missing `current-schema-id`, no schema object matching it).
///
/// `namespace_name` and `table_name` are also used in the exception messages.
Poco::JSON::Object::Ptr buildUpdateMetadataRequestBody(
const String & namespace_name,
const String & table_name,
Poco::JSON::Object::Ptr new_snapshot);

class RestCatalog : public ICatalog, public DB::WithContext
{
public:
Expand Down
Loading
Loading