-
Notifications
You must be signed in to change notification settings - Fork 3.8k
[opt](scan) Use lazy-init for segment iterators and avoid caching all segments in the rowset reader #35432
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[opt](scan) Use lazy-init for segment iterators and avoid caching all segments in the rowset reader #35432
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,6 +38,7 @@ | |
| #include "olap/row_cursor.h" | ||
| #include "olap/rowset/rowset_meta.h" | ||
| #include "olap/rowset/rowset_reader_context.h" | ||
| #include "olap/rowset/segment_v2/lazy_init_segment_iterator.h" | ||
| #include "olap/rowset/segment_v2/segment.h" | ||
| #include "olap/schema.h" | ||
| #include "olap/schema_cache.h" | ||
|
|
@@ -249,38 +250,66 @@ Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context | |
|
|
||
| // load segments | ||
| bool should_use_cache = use_cache || _read_context->reader_type == ReaderType::READER_QUERY; | ||
| RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(_rowset, &_segment_cache_handle, | ||
| SegmentCacheHandle segment_cache_handle; | ||
| RETURN_IF_ERROR(SegmentLoader::instance()->load_segments(_rowset, &segment_cache_handle, | ||
| should_use_cache)); | ||
|
|
||
| // create iterator for each segment | ||
| auto& segments = _segment_cache_handle.get_segments(); | ||
| auto& segments = segment_cache_handle.get_segments(); | ||
| _segments_rows.resize(segments.size()); | ||
| for (size_t i = 0; i < segments.size(); i++) { | ||
| _segments_rows[i] = segments[i]->num_rows(); | ||
| } | ||
|
|
||
| auto [seg_start, seg_end] = _segment_offsets; | ||
| if (seg_start == seg_end) { | ||
| seg_start = 0; | ||
| seg_end = segments.size(); | ||
| } | ||
|
|
||
| const bool is_merge_iterator = _is_merge_iterator(); | ||
| const bool use_lazy_init_iterators = | ||
| !is_merge_iterator && _read_context->reader_type == ReaderType::READER_QUERY; | ||
| for (int i = seg_start; i < seg_end; i++) { | ||
| auto& seg_ptr = segments[i]; | ||
| std::unique_ptr<RowwiseIterator> iter; | ||
| Status status; | ||
|
|
||
| /// If `_segment_row_ranges` is empty, the segment is not split. | ||
| if (_segment_row_ranges.empty()) { | ||
| _read_options.row_ranges.clear(); | ||
| status = seg_ptr->new_iterator(_input_schema, _read_options, &iter); | ||
| if (use_lazy_init_iterators) { | ||
| /// For non-merging iterators, we don't need to initialize them all at once when creating them. | ||
| /// Instead, we should initialize each iterator separately when really using them. | ||
| /// This optimization minimizes the lifecycle of resources like column readers | ||
| /// and prevents excessive memory consumption, especially for wide tables. | ||
| if (_segment_row_ranges.empty()) { | ||
| _read_options.row_ranges.clear(); | ||
| iter = std::make_unique<LazyInitSegmentIterator>(seg_ptr, _input_schema, | ||
| _read_options); | ||
| } else { | ||
| DCHECK_EQ(seg_end - seg_start, _segment_row_ranges.size()); | ||
| auto local_options = _read_options; | ||
| local_options.row_ranges = _segment_row_ranges[i - seg_start]; | ||
| iter = std::make_unique<LazyInitSegmentIterator>(seg_ptr, _input_schema, | ||
| local_options); | ||
| } | ||
| } else { | ||
| DCHECK_EQ(seg_end - seg_start, _segment_row_ranges.size()); | ||
| auto local_options = _read_options; | ||
| local_options.row_ranges = _segment_row_ranges[i - seg_start]; | ||
| status = seg_ptr->new_iterator(_input_schema, local_options, &iter); | ||
| } | ||
| Status status; | ||
| /// If `_segment_row_ranges` is empty, the segment is not split. | ||
| if (_segment_row_ranges.empty()) { | ||
| _read_options.row_ranges.clear(); | ||
| status = seg_ptr->new_iterator(_input_schema, _read_options, &iter); | ||
| } else { | ||
| DCHECK_EQ(seg_end - seg_start, _segment_row_ranges.size()); | ||
| auto local_options = _read_options; | ||
| local_options.row_ranges = _segment_row_ranges[i - seg_start]; | ||
| status = seg_ptr->new_iterator(_input_schema, local_options, &iter); | ||
| } | ||
|
|
||
| if (!status.ok()) { | ||
| LOG(WARNING) << "failed to create iterator[" << seg_ptr->id() | ||
| << "]: " << status.to_string(); | ||
| return Status::Error<ROWSET_READER_INIT>(status.to_string()); | ||
| if (!status.ok()) { | ||
| LOG(WARNING) << "failed to create iterator[" << seg_ptr->id() | ||
| << "]: " << status.to_string(); | ||
| return Status::Error<ROWSET_READER_INIT>(status.to_string()); | ||
| } | ||
| } | ||
|
|
||
| if (iter->empty()) { | ||
| continue; | ||
| } | ||
|
|
@@ -388,11 +417,7 @@ bool BetaRowsetReader::_should_push_down_value_predicates() const { | |
| } | ||
|
|
||
| Status BetaRowsetReader::get_segment_num_rows(std::vector<uint32_t>* segment_num_rows) { | ||
| auto& seg_ptrs = _segment_cache_handle.get_segments(); | ||
| segment_num_rows->resize(seg_ptrs.size()); | ||
| for (size_t i = 0; i < seg_ptrs.size(); i++) { | ||
| (*segment_num_rows)[i] = seg_ptrs[i]->num_rows(); | ||
| } | ||
| segment_num_rows->assign(_segments_rows.cbegin(), _segments_rows.cend()); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you make sure that this method is called after get_segment_iterators? I think it is very dangerous to make such an assumption. |
||
| return Status::OK(); | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Some methods, like new_column_iterator, also need to load column readers. |
||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,38 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #include "olap/rowset/segment_v2/lazy_init_segment_iterator.h" | ||
|
|
||
| namespace doris::segment_v2 { | ||
|
|
||
| LazyInitSegmentIterator::LazyInitSegmentIterator(std::shared_ptr<Segment> segment, | ||
| SchemaSPtr schema, const StorageReadOptions& opts) | ||
| : _schema(std::move(schema)), _segment(std::move(segment)), _read_options(opts) {} | ||
|
|
||
| /// Here do not use the argument of `opts`, | ||
| /// see where the iterator is created in `BetaRowsetReader::get_segment_iterators` | ||
| Status LazyInitSegmentIterator::init(const StorageReadOptions& /*opts*/) { | ||
| _need_lazy_init = false; | ||
| if (_inner_iterator) { | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| RETURN_IF_ERROR(_segment->new_iterator(_schema, _read_options, &_inner_iterator)); | ||
| return _inner_iterator->init(_read_options); | ||
| } | ||
|
|
||
| } // namespace doris::segment_v2 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,67 @@ | ||
| // Licensed to the Apache Software Foundation (ASF) under one | ||
| // or more contributor license agreements. See the NOTICE file | ||
| // distributed with this work for additional information | ||
| // regarding copyright ownership. The ASF licenses this file | ||
| // to you under the Apache License, Version 2.0 (the | ||
| // "License"); you may not use this file except in compliance | ||
| // with the License. You may obtain a copy of the License at | ||
| // | ||
| // http://www.apache.org/licenses/LICENSE-2.0 | ||
| // | ||
| // Unless required by applicable law or agreed to in writing, | ||
| // software distributed under the License is distributed on an | ||
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | ||
| // KIND, either express or implied. See the License for the | ||
| // specific language governing permissions and limitations | ||
| // under the License. | ||
|
|
||
| #pragma once | ||
|
|
||
| #include "olap/rowset/segment_v2/common.h" | ||
| #include "olap/rowset/segment_v2/segment.h" | ||
| #include "olap/rowset/segment_v2/segment_iterator.h" | ||
| #include "vec/core/block.h" | ||
|
|
||
| namespace doris::segment_v2 { | ||
|
|
||
| using namespace vectorized; | ||
|
|
||
| class LazyInitSegmentIterator : public RowwiseIterator { | ||
| public: | ||
| LazyInitSegmentIterator(std::shared_ptr<Segment> segment, SchemaSPtr schema, | ||
| const StorageReadOptions& opts); | ||
|
|
||
| ~LazyInitSegmentIterator() override = default; | ||
|
|
||
| Status init(const StorageReadOptions& opts) override; | ||
|
|
||
| Status next_batch(Block* block) override { | ||
| if (UNLIKELY(_need_lazy_init)) { | ||
| RETURN_IF_ERROR(init(_read_options)); | ||
| DCHECK(_inner_iterator != nullptr); | ||
| } | ||
|
|
||
| return _inner_iterator->next_batch(block); | ||
| } | ||
|
|
||
| const Schema& schema() const override { return *_schema; } | ||
|
|
||
| Status current_block_row_locations(std::vector<RowLocation>* locations) override { | ||
| return _inner_iterator->current_block_row_locations(locations); | ||
| } | ||
|
|
||
| bool update_profile(RuntimeProfile* profile) override { | ||
| if (_inner_iterator != nullptr) { | ||
| return _inner_iterator->update_profile(profile); | ||
| } | ||
| return false; | ||
| } | ||
|
|
||
| private: | ||
| bool _need_lazy_init {true}; | ||
| SchemaSPtr _schema = nullptr; | ||
| std::shared_ptr<Segment> _segment; | ||
| StorageReadOptions _read_options; | ||
| RowwiseIteratorUPtr _inner_iterator; | ||
| }; | ||
| } // namespace doris::segment_v2 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need to check the reader type?