Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Storages/ObjectStorage/StorageObjectStorageSource.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -885,7 +885,7 @@ StorageObjectStorageSource::ReaderHolder StorageObjectStorageSource::createReade

InputFormatPtr input_format;
if (context_->getSettingsRef()[Setting::use_parquet_metadata_cache] && use_native_reader_v3
&& (object_info->getFileFormat().value_or(configuration->getFormat()) == "Parquet")
&& (Poco::toLower(object_info->getFileFormat().value_or(configuration->getFormat())) == "parquet")
&& !object_info->getObjectMetadata()->etag.empty())
{
const std::optional<RelativePathWithMetadata> object_with_metadata = object_info->relative_path_with_metadata;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,18 @@ def execute_spark_query(query: str):
for replica in started_cluster_iceberg_with_spark.instances.values():
replica.query("SYSTEM FLUSH LOGS")

# Number of object-get requests per data file that are NOT served from caches
# after the warmup query above. The parquet metadata cache (enabled by default)
# caches the parquet footer keyed by the object's etag; the warmup query then
# populates it, so any subsequent read of the same file skips one object-get
# (the footer read). However, AzureObjectStorage::getObjectMetadata does NOT
# populate etag, so the cache guard `!etag.empty()` in
# StorageObjectStorageSource::createReader always fails for Azure, and the
# cache path is never taken there. As a result the multiplier is:
# S3: 2 (footer served from cache, data-only gets remain)
# Azure: 3 (cache never engaged, footer + data gets)
per_file_gets = 2 if storage_type == "s3" else 3

def check_events(query_id, event, is_cluster, expected):
res = instance.query(
f"""
Expand All @@ -183,11 +195,12 @@ def check_events(query_id, event, is_cluster, expected):
GROUP BY ALL
FORMAT CSV
""")
# Weird, bu looks like ReadFileMetadata does not used local file cache in 26.1
# Weird, but it looks like ReadFileMetadata does not use the local file cache in 26.1:
# metadata.json is always downloaded in 26.1, once per query or subquery.
# In 25.8 count was equal to expected, in 26.1 it is expected * 3 + 1 for Local case
# expected * 3 + 4 for Cluster case, because each subquery loads mettadata.json
assert int(res) == expected * 3 + (4 if is_cluster else 1)
# In 25.8 count was equal to expected, in 26.1 it is expected * N + 1 for Local
# case and expected * N + 4 for Cluster case (each subquery loads metadata.json).
# N = per_file_gets (see comment above the function).
assert int(res) == expected * per_file_gets + (4 if is_cluster else 1)

event = "S3GetObject" if storage_type == "s3" else "AzureGetObject"

Expand All @@ -212,4 +225,4 @@ def compare_selects(query):

compare_selects(f"SELECT _path,* FROM {creation_expression} ORDER BY ALL")
compare_selects(f"SELECT _path,* FROM {creation_expression} WHERE name_old='vasily' ORDER BY ALL")
compare_selects(f"SELECT _path,* FROM {creation_expression} WHERE ((tag + length(name_old)) % 2 = 1) ORDER BY ALL")
compare_selects(f"SELECT _path,* FROM {creation_expression} WHERE ((tag + length(name_old)) % 2 = 1) ORDER BY ALL")
Loading