Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions be/src/olap/inverted_index_parser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,15 @@ CharFilterMap get_parser_char_filter_map_from_properties(
return char_filter_map;
}

// Returns the "ignore_above" setting from the index properties as a string.
// When the key is absent, INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE is returned.
// The stored value is returned verbatim; no numeric validation is done here.
std::string get_parser_ignore_above_value_from_properties(
        const std::map<std::string, std::string>& properties) {
    // One lookup only: reuse the iterator instead of find() followed by at().
    const auto it = properties.find(INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY);
    if (it == properties.end()) {
        return INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE;
    }
    return it->second;
}

std::string get_parser_lowercase_from_properties(
const std::map<std::string, std::string>& properties) {
if (properties.find(INVERTED_INDEX_PARSER_LOWERCASE_KEY) != properties.end()) {
Expand Down
7 changes: 7 additions & 0 deletions be/src/olap/inverted_index_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type";
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern";
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement";

// Index property key holding the maximum length of a string value to index.
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above";
// Default for the property above; kept as a string because property values are strings.
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256";

const std::string INVERTED_INDEX_PARSER_LOWERCASE_KEY = "lower_case";

std::string inverted_index_parser_type_to_string(InvertedIndexParserType parser_type);
Expand All @@ -84,6 +87,10 @@ std::string get_parser_phrase_support_string_from_properties(
CharFilterMap get_parser_char_filter_map_from_properties(
const std::map<std::string, std::string>& properties);

// Returns the value stored under the "ignore_above" key in `properties`,
// or INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE when the key is not present.
// The result is the raw property string; callers convert it to a number themselves.
std::string get_parser_ignore_above_value_from_properties(
const std::map<std::string, std::string>& properties);

std::string get_parser_lowercase_from_properties(
const std::map<std::string, std::string>& properties);
} // namespace doris
34 changes: 30 additions & 4 deletions be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,9 +294,22 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
"field or index writer is null in inverted index writer");
}
auto* v = (Slice*)values;
auto ignore_above_value =
get_parser_ignore_above_value_from_properties(_index_meta->properties());
auto ignore_above = std::stoi(ignore_above_value);
for (int i = 0; i < count; ++i) {
new_fulltext_field(v->get_data(), v->get_size());
RETURN_IF_ERROR(add_document());
// only ignore_above UNTOKENIZED strings
if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
v->get_size() > ignore_above) {
VLOG_DEBUG << "fulltext index value length can be at most "
<< ignore_above_value << ", but got "
<< "value length:" << v->get_size() << ", ignore this value";
new_fulltext_field(empty_value.c_str(), 0);
RETURN_IF_ERROR(add_null_document());
} else {
new_fulltext_field(v->get_data(), v->get_size());
RETURN_IF_ERROR(add_document());
}
++v;
_rid++;
}
Expand All @@ -319,6 +332,9 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
return Status::InternalError(
"field or index writer is null in inverted index writer");
}
auto ignore_above_value =
get_parser_ignore_above_value_from_properties(_index_meta->properties());
auto ignore_above = std::stoi(ignore_above_value);
for (int i = 0; i < count; ++i) {
// offsets[i+1] is now row element count
std::vector<std::string> strings;
Expand All @@ -335,9 +351,19 @@ class InvertedIndexColumnWriterImpl : public InvertedIndexColumnWriter {
}

auto value = join(strings, " ");
new_fulltext_field(value.c_str(), value.length());
// only ignore_above UNTOKENIZED strings
if (_parser_type == InvertedIndexParserType::PARSER_NONE &&
value.length() > ignore_above) {
VLOG_DEBUG << "fulltext index value length can be at most "
<< ignore_above_value << ", but got "
<< "value length:" << value.length() << ", ignore this value";
new_fulltext_field(empty_value.c_str(), 0);
RETURN_IF_ERROR(add_null_document());
} else {
new_fulltext_field(value.c_str(), value.length());
RETURN_IF_ERROR(add_document());
}
_rid++;
_index_writer->addDocument(_doc.get());
}
} else if constexpr (field_is_numeric_type(field_type)) {
for (int i = 0; i < count; ++i) {
Expand Down
3 changes: 3 additions & 0 deletions docs/en/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ The features for inverted index is as follows:
- char_replace: replace each char in the pattern with a char in the replacement
- char_filter_pattern: character array to be replaced
- char_filter_replacement: replaced character array, can be left unset, defaults to a space character
- ignore_above: Length threshold that controls whether a string is indexed. It only takes effect for untokenized strings.
- Strings longer than the ignore_above setting will not be indexed. For arrays of strings, ignore_above will be applied for each array element separately and string elements longer than ignore_above will not be indexed.
- The default value is 256 bytes.
- lower_case: Whether to convert tokens to lowercase, thereby achieving case-insensitive matching.
- true: Convert to lowercase
- false: Do not convert to lowercase
Expand Down
3 changes: 3 additions & 0 deletions docs/zh-CN/docs/data-table/index/inverted-index.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ Doris倒排索引的功能简要介绍如下:
- char_replace 将pattern中每个char替换为一个replacement中的char
- char_filter_pattern:需要被替换掉的字符数组
- char_filter_replacement:替换后的字符数组,可以不用配置,默认为一个空格字符
- ignore_above:控制字符串是否建索引的长度阈值,仅对不分词的字符串生效。
- 长度超过 ignore_above 设置的字符串不会被索引。对于字符串数组,ignore_above 将分别应用于每个数组元素,长度超过 ignore_above 的字符串元素将不被索引。
- 默认值为 256 字节。
- lower_case: 是否将分词进行小写转换,从而在匹配的时候实现忽略大小写
- true: 转换小写
- false:不转换小写
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ public class InvertedIndexUtil {

public static String INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace";

public static String INVERTED_INDEX_PARSER_IGNORE_ABOVE = "ignore_above";

public static String INVERTED_INDEX_PARSER_LOWERCASE = "lower_case";

public static String getInvertedIndexParser(Map<String, String> properties) {
Expand Down Expand Up @@ -100,6 +102,17 @@ public static void checkInvertedIndexParser(String indexColName, PrimitiveType c
if (parser == null && !properties.isEmpty()) {
throw new AnalysisException("invalid index properties, please check the properties");
}
String ignoreAbove = properties.get(INVERTED_INDEX_PARSER_IGNORE_ABOVE);
if (ignoreAbove != null) {
try {
int ignoreAboveValue = Integer.parseInt(ignoreAbove);
if (ignoreAboveValue <= 0) {
throw new AnalysisException("invalid index properties, ignore_above must be positive");
}
} catch (NumberFormatException e) {
throw new AnalysisException("invalid index properties, ignore_above must be integer");
}
}
String lowerCase = properties.get(INVERTED_INDEX_PARSER_LOWERCASE);
if (lowerCase != null) {
if (!"true".equals(lowerCase) && !"false".equals(lowerCase)) {
Expand Down