Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,9 @@

#include <algorithm>
#include <memory>
#include <string>

#include "olap/rowset/segment_v2/inverted_index/query_v2/nullable_scorer.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/query.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/scorer.h"
#include "olap/rowset/segment_v2/inverted_index/query_v2/weight.h"
Expand All @@ -34,14 +36,14 @@ using AllScorerPtr = std::shared_ptr<AllScorer>;
using AllWeightPtr = std::shared_ptr<AllWeight>;
using AllQueryPtr = std::shared_ptr<AllQuery>;

/// Scorer that matches all documents [0, max_doc).
/// Mirrors Lucene's MatchAllDocsQuery scorer with ConstantScoreWeight:
/// returns a constant score of 1.0 when scoring is enabled, 0.0 otherwise.
class AllScorer : public Scorer {
public:
explicit AllScorer(uint32_t max_doc) : _max_doc(max_doc) {
if (_max_doc == 0) {
_doc = TERMINATED;
} else {
_doc = 0;
}
AllScorer(uint32_t max_doc, bool enable_scoring)
: _max_doc(max_doc), _score(enable_scoring ? 1.0F : 0.0F) {
_doc = (_max_doc == 0) ? TERMINATED : 0;
}

~AllScorer() override = default;
Expand Down Expand Up @@ -72,41 +74,60 @@ class AllScorer : public Scorer {
return _doc;
}

float score() override { return 1.0F; }
float score() override { return _score; }

uint32_t size_hint() const override { return _max_doc; }

private:
uint32_t _max_doc = 0;
uint32_t _doc = TERMINATED;
float _score;
};

/// Weight for AllQuery. Analogous to Lucene's ConstantScoreWeight used by MatchAllDocsQuery.
class AllWeight : public Weight {
public:
explicit AllWeight(uint32_t max_doc) : _max_doc(max_doc) {}
explicit AllWeight(bool enable_scoring) : _enable_scoring(enable_scoring) {}

AllWeight(std::wstring field, bool nullable, bool enable_scoring)
: _field(std::move(field)), _nullable(nullable), _enable_scoring(enable_scoring) {}

~AllWeight() override = default;

ScorerPtr scorer(const QueryExecutionContext& context) override {
return std::make_shared<AllScorer>(_max_doc);
auto inner = std::make_shared<AllScorer>(context.segment_num_rows, _enable_scoring);
if (_nullable && context.null_resolver != nullptr) {
std::string logical = logical_field_or_fallback(context, "", _field);
return make_nullable_scorer(std::move(inner), logical, context.null_resolver);
}
return inner;
}

private:
uint32_t _max_doc = 0;
std::wstring _field;
bool _nullable = false;
bool _enable_scoring = false;
};

/// Query that matches all documents, analogous to Lucene's MatchAllDocsQuery.
/// Uses constant scoring like Lucene's ConstantScoreWeight: every matching document
/// scores 1.0 when scoring is enabled and 0.0 otherwise.
class AllQuery : public Query {
public:
explicit AllQuery(uint32_t max_doc) : _max_doc(max_doc) {}
AllQuery() = default;
AllQuery(std::wstring field, bool nullable) : _field(std::move(field)), _nullable(nullable) {}

~AllQuery() override = default;

WeightPtr weight(bool /*enable_scoring*/) override {
return std::make_shared<AllWeight>(_max_doc);
WeightPtr weight(bool enable_scoring) override {
if (!_field.empty()) {
return std::make_shared<AllWeight>(_field, _nullable, enable_scoring);
}
return std::make_shared<AllWeight>(enable_scoring);
}

private:
uint32_t _max_doc = 0;
std::wstring _field;
bool _nullable = false;
};

} // namespace doris::segment_v2::inverted_index::query_v2
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ std::optional<CombinationMethod> OccurBooleanWeight<ScoreCombinerPtrT>::build_sh
} else if (adjusted_minimum == 1) {
return Required {scorer_union(std::move(should_scorers), combiner)};
} else if (adjusted_minimum == num_of_should_scorers) {
// All SHOULD clauses must match - move them to must_scorers (append, not swap)
for (auto& scorer : should_scorers) {
must_scorers.push_back(std::move(scorer));
}
Expand All @@ -137,7 +138,7 @@ ScorerPtr OccurBooleanWeight<ScoreCombinerPtrT>::effective_must_scorer(
std::vector<ScorerPtr> must_scorers, size_t must_num_all_scorers) {
if (must_scorers.empty()) {
if (must_num_all_scorers > 0) {
return std::make_shared<AllScorer>(_max_doc);
return std::make_shared<AllScorer>(_max_doc, _enable_scoring);
}
return nullptr;
}
Expand All @@ -152,10 +153,10 @@ SpecializedScorer OccurBooleanWeight<ScoreCombinerPtrT>::effective_should_scorer
if (_enable_scoring) {
std::vector<ScorerPtr> scorers;
scorers.push_back(into_box_scorer(std::move(should_scorer), combiner));
scorers.push_back(std::make_shared<AllScorer>(_max_doc));
scorers.push_back(std::make_shared<AllScorer>(_max_doc, _enable_scoring));
return make_buffered_union(std::move(scorers), combiner);
} else {
return std::make_shared<AllScorer>(_max_doc);
return std::make_shared<AllScorer>(_max_doc, _enable_scoring);
}
}
return should_scorer;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ class RegexpWeight : public Weight {
std::string _pattern;
bool _enable_scoring = false;
bool _nullable = true;
int32_t _max_expansions = 50;
// Maximum number of terms to expand to; 0 disables the limit (ES has no default limit
// for prefix queries). A non-zero limit prevents collecting too many terms, but it can
// cause incorrect results.
int32_t _max_expansions = 0;
};

} // namespace doris::segment_v2::inverted_index::query_v2
14 changes: 11 additions & 3 deletions be/src/olap/tablet_schema.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
#include "olap/inverted_index_parser.h"
#include "olap/olap_common.h"
#include "olap/olap_define.h"
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
#include "olap/tablet_column_object_pool.h"
#include "olap/types.h"
#include "olap/utils.h"
Expand Down Expand Up @@ -943,9 +944,16 @@ void TabletIndex::to_schema_pb(TabletIndexPB* index) const {

DBUG_EXECUTE_IF("tablet_schema.to_schema_pb", { return; })

// lowercase by default
if (!_properties.empty()) {
if (!_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) {
// Only add the lower_case=true default for built-in analyzers/parsers, NOT for custom analyzers.
// For a custom analyzer, lowercasing is determined by the analyzer's own token filters.
if (!_properties.empty() && !_properties.contains(INVERTED_INDEX_PARSER_LOWERCASE_KEY)) {
bool has_parser = _properties.contains(INVERTED_INDEX_PARSER_KEY) ||
_properties.contains(INVERTED_INDEX_PARSER_KEY_ALIAS);
std::string analyzer_name = get_analyzer_name_from_properties(_properties);
bool is_builtin = analyzer_name.empty() ||
segment_v2::inverted_index::InvertedIndexAnalyzer::is_builtin_analyzer(
analyzer_name);
if (has_parser || is_builtin) {
(*index->mutable_properties())[INVERTED_INDEX_PARSER_LOWERCASE_KEY] =
INVERTED_INDEX_PARSER_TRUE;
}
Expand Down
Loading
Loading