Skip to content

Commit d498425

Browse files
authored
[opt](inverted index) add custom analyzer support with char_filter, basic and icu tokenizer (#57266)
#56243 #57055
1 parent e7489e2 commit d498425

File tree

95 files changed

+3000
-474
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

95 files changed

+3000
-474
lines changed

be/src/olap/inverted_index_parser.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818
#include "olap/inverted_index_parser.h"
1919

20-
#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
2120
#include "util/string_util.h"
2221

2322
namespace doris {
@@ -70,11 +69,15 @@ InvertedIndexParserType get_inverted_index_parser_type_from_string(const std::st
7069

7170
std::string get_parser_string_from_properties(
7271
const std::map<std::string, std::string>& properties) {
73-
if (properties.find(INVERTED_INDEX_PARSER_KEY) != properties.end()) {
74-
return properties.at(INVERTED_INDEX_PARSER_KEY);
75-
} else {
76-
return INVERTED_INDEX_PARSER_NONE;
72+
auto it = properties.find(INVERTED_INDEX_PARSER_KEY);
73+
if (it != properties.end()) {
74+
return it->second;
7775
}
76+
it = properties.find(INVERTED_INDEX_PARSER_KEY_ALIAS);
77+
if (it != properties.end()) {
78+
return it->second;
79+
}
80+
return INVERTED_INDEX_PARSER_NONE;
7881
}
7982

8083
std::string get_parser_mode_string_from_properties(
@@ -83,6 +86,9 @@ std::string get_parser_mode_string_from_properties(
8386
return properties.at(INVERTED_INDEX_PARSER_MODE_KEY);
8487
} else {
8588
auto parser_it = properties.find(INVERTED_INDEX_PARSER_KEY);
89+
if (parser_it == properties.end()) {
90+
parser_it = properties.find(INVERTED_INDEX_PARSER_KEY_ALIAS);
91+
}
8692
if (parser_it != properties.end() && parser_it->second == INVERTED_INDEX_PARSER_IK) {
8793
return INVERTED_INDEX_PARSER_SMART;
8894
}

be/src/olap/inverted_index_parser.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ const std::string INVERTED_INDEX_PARSER_MAX_WORD = "ik_max_word";
6868
const std::string INVERTED_INDEX_PARSER_SMART = "ik_smart";
6969

7070
const std::string INVERTED_INDEX_PARSER_KEY = "parser";
71+
const std::string INVERTED_INDEX_PARSER_KEY_ALIAS = "built_in_analyzer";
7172
const std::string INVERTED_INDEX_PARSER_UNKNOWN = "unknown";
7273
const std::string INVERTED_INDEX_PARSER_NONE = "none";
7374
const std::string INVERTED_INDEX_PARSER_STANDARD = "standard";
@@ -85,6 +86,7 @@ const std::string INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO = "false";
8586
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE = "char_filter_type";
8687
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN = "char_filter_pattern";
8788
const std::string INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT = "char_filter_replacement";
89+
const std::string INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE = "char_replace";
8890

8991
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_KEY = "ignore_above";
9092
const std::string INVERTED_INDEX_PARSER_IGNORE_ABOVE_VALUE = "256";

be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.cpp

Lines changed: 46 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,16 @@
1717

1818
#include "analysis_factory_mgr.h"
1919

20+
#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h"
21+
#include "olap/rowset/segment_v2/inverted_index/char_filter/empty_char_filter_factory.h"
2022
#include "olap/rowset/segment_v2/inverted_index/token_filter/ascii_folding_filter_factory.h"
23+
#include "olap/rowset/segment_v2/inverted_index/token_filter/empty_token_filter_factory.h"
2124
#include "olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter_factory.h"
2225
#include "olap/rowset/segment_v2/inverted_index/token_filter/word_delimiter_filter_factory.h"
26+
#include "olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer_factory.h"
2327
#include "olap/rowset/segment_v2/inverted_index/tokenizer/char/char_group_tokenizer_factory.h"
28+
#include "olap/rowset/segment_v2/inverted_index/tokenizer/empty/empty_tokenizer_factory.h"
29+
#include "olap/rowset/segment_v2/inverted_index/tokenizer/icu/icu_tokenizer_factory.h"
2430
#include "olap/rowset/segment_v2/inverted_index/tokenizer/keyword/keyword_tokenizer_factory.h"
2531
#include "olap/rowset/segment_v2/inverted_index/tokenizer/ngram/edge_ngram_tokenizer_factory.h"
2632
#include "olap/rowset/segment_v2/inverted_index/tokenizer/standard/standard_tokenizer_factory.h"
@@ -30,26 +36,46 @@ namespace doris::segment_v2::inverted_index {
3036
void AnalysisFactoryMgr::initialise() {
3137
static std::once_flag once_flag;
3238
std::call_once(once_flag, [this]() {
39+
// char_filter
40+
registerFactory<CharFilterFactory>(
41+
"empty", []() { return std::make_shared<EmptyCharFilterFactory>(); });
42+
registerFactory<CharFilterFactory>(
43+
"char_replace", []() { return std::make_shared<CharReplaceCharFilterFactory>(); });
44+
3345
// tokenizer
34-
registerFactory("standard", []() { return std::make_shared<StandardTokenizerFactory>(); });
35-
registerFactory("keyword", []() { return std::make_shared<KeywordTokenizerFactory>(); });
36-
registerFactory("ngram", []() { return std::make_shared<NGramTokenizerFactory>(); });
37-
registerFactory("edge_ngram",
38-
[]() { return std::make_shared<EdgeNGramTokenizerFactory>(); });
39-
registerFactory("char_group",
40-
[]() { return std::make_shared<CharGroupTokenizerFactory>(); });
46+
registerFactory<TokenizerFactory>(
47+
"empty", []() { return std::make_shared<EmptyTokenizerFactory>(); });
48+
registerFactory<TokenizerFactory>(
49+
"standard", []() { return std::make_shared<StandardTokenizerFactory>(); });
50+
registerFactory<TokenizerFactory>(
51+
"keyword", []() { return std::make_shared<KeywordTokenizerFactory>(); });
52+
registerFactory<TokenizerFactory>(
53+
"ngram", []() { return std::make_shared<NGramTokenizerFactory>(); });
54+
registerFactory<TokenizerFactory>(
55+
"edge_ngram", []() { return std::make_shared<EdgeNGramTokenizerFactory>(); });
56+
registerFactory<TokenizerFactory>(
57+
"char_group", []() { return std::make_shared<CharGroupTokenizerFactory>(); });
58+
registerFactory<TokenizerFactory>(
59+
"basic", []() { return std::make_shared<BasicTokenizerFactory>(); });
60+
registerFactory<TokenizerFactory>("icu",
61+
[]() { return std::make_shared<ICUTokenizerFactory>(); });
4162

4263
// token_filter
43-
registerFactory("lowercase", []() { return std::make_shared<LowerCaseFilterFactory>(); });
44-
registerFactory("asciifolding",
45-
[]() { return std::make_shared<ASCIIFoldingFilterFactory>(); });
46-
registerFactory("word_delimiter",
47-
[]() { return std::make_shared<WordDelimiterFilterFactory>(); });
64+
registerFactory<TokenFilterFactory>(
65+
"empty", []() { return std::make_shared<EmptyTokenFilterFactory>(); });
66+
registerFactory<TokenFilterFactory>(
67+
"lowercase", []() { return std::make_shared<LowerCaseFilterFactory>(); });
68+
registerFactory<TokenFilterFactory>(
69+
"asciifolding", []() { return std::make_shared<ASCIIFoldingFilterFactory>(); });
70+
registerFactory<TokenFilterFactory>(
71+
"word_delimiter", []() { return std::make_shared<WordDelimiterFilterFactory>(); });
4872
});
4973
}
5074

75+
template <typename FactoryType>
5176
void AnalysisFactoryMgr::registerFactory(const std::string& name, FactoryCreator creator) {
52-
registry_[name] = std::move(creator);
77+
RegistryKey key = {std::type_index(typeid(FactoryType)), name};
78+
registry_[key] = std::move(creator);
5379
}
5480

5581
template <typename FactoryType>
@@ -59,9 +85,11 @@ std::shared_ptr<FactoryType> AnalysisFactoryMgr::create(const std::string& name,
5985
initialise();
6086
}
6187

62-
auto it = registry_.find(name);
88+
RegistryKey key = {std::type_index(typeid(FactoryType)), name};
89+
auto it = registry_.find(key);
6390
if (it == registry_.end()) {
64-
throw Exception(ErrorCode::INVALID_ARGUMENT, "Unknown factory name: {}", name);
91+
throw Exception(ErrorCode::INVALID_ARGUMENT, "Unknown factory name: {} for type: {}", name,
92+
typeid(FactoryType).name());
6593
}
6694

6795
auto factory = std::static_pointer_cast<FactoryType>(it->second());
@@ -75,4 +103,7 @@ template std::shared_ptr<TokenizerFactory> AnalysisFactoryMgr::create<TokenizerF
75103
template std::shared_ptr<TokenFilterFactory> AnalysisFactoryMgr::create<TokenFilterFactory>(
76104
const std::string&, const Settings&);
77105

106+
template std::shared_ptr<CharFilterFactory> AnalysisFactoryMgr::create<CharFilterFactory>(
107+
const std::string&, const Settings&);
108+
78109
} // namespace doris::segment_v2::inverted_index

be/src/olap/rowset/segment_v2/inverted_index/analysis_factory_mgr.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717

1818
#pragma once
1919

20-
#include "common/exception.h"
20+
#include <typeindex>
21+
2122
#include "olap/rowset/segment_v2/inverted_index/abstract_analysis_factory.h"
2223
#include "olap/rowset/segment_v2/inverted_index/setting.h"
2324

@@ -26,6 +27,7 @@ namespace doris::segment_v2::inverted_index {
2627
class AnalysisFactoryMgr {
2728
public:
2829
using FactoryCreator = std::function<AbstractAnalysisFactoryPtr()>;
30+
using RegistryKey = std::pair<std::type_index, std::string>;
2931

3032
AnalysisFactoryMgr(const AnalysisFactoryMgr&) = delete;
3133
AnalysisFactoryMgr& operator=(const AnalysisFactoryMgr&) = delete;
@@ -36,6 +38,8 @@ class AnalysisFactoryMgr {
3638
}
3739

3840
void initialise();
41+
42+
template <typename FactoryType>
3943
void registerFactory(const std::string& name, FactoryCreator creator);
4044

4145
template <typename FactoryType>
@@ -45,7 +49,7 @@ class AnalysisFactoryMgr {
4549
AnalysisFactoryMgr() = default;
4650
~AnalysisFactoryMgr() = default;
4751

48-
std::map<std::string, FactoryCreator> registry_;
52+
std::map<RegistryKey, FactoryCreator> registry_;
4953
};
5054

5155
} // namespace doris::segment_v2::inverted_index

be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.cpp

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -35,23 +35,22 @@
3535
#include "olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h"
3636
#include "olap/rowset/segment_v2/inverted_index/analyzer/icu/icu_analyzer.h"
3737
#include "olap/rowset/segment_v2/inverted_index/analyzer/ik/IKAnalyzer.h"
38-
#include "olap/rowset/segment_v2/inverted_index/char_filter/char_filter_factory.h"
38+
#include "olap/rowset/segment_v2/inverted_index/char_filter/char_replace_char_filter_factory.h"
3939
#include "runtime/exec_env.h"
4040
#include "runtime/index_policy/index_policy_mgr.h"
41-
#include "util/runtime_profile.h"
4241

4342
namespace doris::segment_v2::inverted_index {
4443
#include "common/compile_check_begin.h"
4544

46-
std::unique_ptr<lucene::util::Reader> InvertedIndexAnalyzer::create_reader(
47-
CharFilterMap& char_filter_map) {
48-
std::unique_ptr<lucene::util::Reader> reader =
49-
std::make_unique<lucene::util::SStringReader<char>>();
45+
ReaderPtr InvertedIndexAnalyzer::create_reader(CharFilterMap& char_filter_map) {
46+
ReaderPtr reader = std::make_shared<lucene::util::SStringReader<char>>();
5047
if (!char_filter_map.empty()) {
51-
reader = std::unique_ptr<lucene::util::Reader>(CharFilterFactory::create(
52-
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE], reader.release(),
53-
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
54-
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]));
48+
if (char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_TYPE] ==
49+
INVERTED_INDEX_CHAR_FILTER_CHAR_REPLACE) {
50+
reader = std::make_shared<CharReplaceCharFilter>(
51+
reader, char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_PATTERN],
52+
char_filter_map[INVERTED_INDEX_PARSER_CHAR_FILTER_REPLACEMENT]);
53+
}
5554
}
5655
return reader;
5756
}
@@ -122,7 +121,7 @@ std::shared_ptr<lucene::analysis::Analyzer> InvertedIndexAnalyzer::create_analyz
122121
}
123122

124123
std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
125-
lucene::util::Reader* reader, lucene::analysis::Analyzer* analyzer) {
124+
ReaderPtr reader, lucene::analysis::Analyzer* analyzer) {
126125
std::vector<TermInfo> analyse_result;
127126

128127
std::unique_ptr<lucene::analysis::TokenStream> token_stream(analyzer->tokenStream(L"", reader));
@@ -161,7 +160,7 @@ std::vector<TermInfo> InvertedIndexAnalyzer::get_analyse_result(
161160
inverted_index_ctx->analyzer = analyzer.get();
162161
auto reader = create_reader(inverted_index_ctx->char_filter_map);
163162
reader->init(search_str.data(), static_cast<int32_t>(search_str.size()), true);
164-
return get_analyse_result(reader.get(), analyzer.get());
163+
return get_analyse_result(reader, analyzer.get());
165164
}
166165

167166
bool InvertedIndexAnalyzer::should_analyzer(const std::map<std::string, std::string>& properties) {

be/src/olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "olap/inverted_index_parser.h"
2424
#include "olap/olap_common.h"
2525
#include "olap/rowset/segment_v2/inverted_index/query/query.h"
26+
#include "olap/rowset/segment_v2/inverted_index/util/reader.h"
2627
#include "olap/rowset/segment_v2/inverted_index_query_type.h"
2728

2829
namespace lucene {
@@ -38,12 +39,12 @@ namespace doris::segment_v2::inverted_index {
3839

3940
class InvertedIndexAnalyzer {
4041
public:
41-
static std::unique_ptr<lucene::util::Reader> create_reader(CharFilterMap& char_filter_map);
42+
static ReaderPtr create_reader(CharFilterMap& char_filter_map);
4243

4344
static std::shared_ptr<lucene::analysis::Analyzer> create_analyzer(
4445
const InvertedIndexCtx* inverted_index_ctx);
4546

46-
static std::vector<TermInfo> get_analyse_result(lucene::util::Reader* reader,
47+
static std::vector<TermInfo> get_analyse_result(ReaderPtr reader,
4748
lucene::analysis::Analyzer* analyzer);
4849

4950
static std::vector<TermInfo> get_analyse_result(

be/src/olap/rowset/segment_v2/inverted_index/analyzer/basic/basic_analyzer.h

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,9 @@
1717

1818
#pragma once
1919

20-
#include <memory>
21-
22-
#include "basic_tokenizer.h"
20+
#include "olap/rowset/segment_v2/inverted_index/token_filter/lower_case_filter.h"
21+
#include "olap/rowset/segment_v2/inverted_index/token_stream.h"
22+
#include "olap/rowset/segment_v2/inverted_index/tokenizer/basic/basic_tokenizer.h"
2323

2424
namespace doris::segment_v2 {
2525

@@ -35,22 +35,47 @@ class BasicAnalyzer : public Analyzer {
3535
bool isSDocOpt() override { return true; }
3636

3737
TokenStream* tokenStream(const TCHAR* fieldName, lucene::util::Reader* reader) override {
38-
auto* tokenizer = _CLNEW BasicTokenizer(_lowercase, _ownReader);
39-
tokenizer->reset(reader);
40-
return (TokenStream*)tokenizer;
38+
throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED,
39+
"BasicAnalyzer::tokenStream not supported");
4140
}
4241

4342
TokenStream* reusableTokenStream(const TCHAR* fieldName,
4443
lucene::util::Reader* reader) override {
45-
if (_tokenizer == nullptr) {
46-
_tokenizer = std::make_unique<BasicTokenizer>(_lowercase, _ownReader);
44+
throw Exception(ErrorCode::INVERTED_INDEX_NOT_SUPPORTED,
45+
"BasicAnalyzer::reusableTokenStream not supported");
46+
}
47+
48+
TokenStream* tokenStream(const TCHAR* fieldName,
49+
const inverted_index::ReaderPtr& reader) override {
50+
auto token_stream = create_components();
51+
token_stream->set_reader(reader);
52+
token_stream->get_token_stream()->reset();
53+
return new inverted_index::TokenStreamWrapper(token_stream->get_token_stream());
54+
}
55+
56+
TokenStream* reusableTokenStream(const TCHAR* fieldName,
57+
const inverted_index::ReaderPtr& reader) override {
58+
if (_reuse_token_stream == nullptr) {
59+
_reuse_token_stream = create_components();
4760
}
48-
_tokenizer->reset(reader);
49-
return (TokenStream*)_tokenizer.get();
61+
_reuse_token_stream->set_reader(reader);
62+
return _reuse_token_stream->get_token_stream().get();
5063
};
5164

5265
private:
53-
std::unique_ptr<BasicTokenizer> _tokenizer;
66+
inverted_index::TokenStreamComponentsPtr create_components() {
67+
auto tk = std::make_shared<inverted_index::BasicTokenizer>();
68+
tk->initialize();
69+
inverted_index::TokenStreamPtr ts = tk;
70+
if (_lowercase) {
71+
auto lower_case_filter = std::make_shared<inverted_index::LowerCaseFilter>(tk);
72+
lower_case_filter->initialize();
73+
ts = lower_case_filter;
74+
}
75+
return std::make_shared<inverted_index::TokenStreamComponents>(tk, ts);
76+
}
77+
78+
inverted_index::TokenStreamComponentsPtr _reuse_token_stream;
5479
};
5580

5681
} // namespace doris::segment_v2

0 commit comments

Comments
 (0)