Skip to content

Commit 53c4f52

Browse files
authored
feat: Add support for escaping characters in KQL key names. (#560)
1 parent 2dfcd8a commit 53c4f52

File tree

8 files changed

+95
-17
lines changed

8 files changed

+95
-17
lines changed

components/core/src/clp_s/JsonParser.cpp

+6-1
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,12 @@ JsonParser::JsonParser(JsonParserOption const& option)
2121
}
2222

2323
if (false == m_timestamp_key.empty()) {
24-
clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column);
24+
if (false
25+
== clp_s::StringUtils::tokenize_column_descriptor(m_timestamp_key, m_timestamp_column))
26+
{
27+
SPDLOG_ERROR("Can not parse invalid timestamp key: \"{}\"", m_timestamp_key);
28+
throw OperationFailed(ErrorCodeBadParam, __FILENAME__, __LINE__);
29+
}
2530
}
2631

2732
for (auto& file_path : option.file_paths) {

components/core/src/clp_s/TimestampDictionaryReader.cpp

+3-1
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,9 @@ void TimestampDictionaryReader::read_new_entries() {
4444
TimestampEntry entry;
4545
std::vector<std::string> tokens;
4646
entry.try_read_from_file(m_dictionary_decompressor);
47-
StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens);
47+
if (false == StringUtils::tokenize_column_descriptor(entry.get_key_name(), tokens)) {
48+
throw OperationFailed(ErrorCodeCorrupt, __FILENAME__, __LINE__);
49+
}
4850
m_entries.emplace_back(std::move(entry));
4951

5052
// TODO: Currently, we only allow a single authoritative timestamp column at ingestion time,

components/core/src/clp_s/Utils.cpp

+25-9
Original file line numberDiff line numberDiff line change
@@ -427,18 +427,34 @@ bool StringUtils::convert_string_to_double(std::string const& raw, double& conve
427427
return true;
428428
}
429429

430-
void StringUtils::tokenize_column_descriptor(
430+
bool StringUtils::tokenize_column_descriptor(
431431
std::string const& descriptor,
432432
std::vector<std::string>& tokens
433433
) {
434-
// TODO: handle escaped . correctly
435-
auto start = 0U;
436-
auto end = descriptor.find('.');
437-
while (end != std::string::npos) {
438-
tokens.push_back(descriptor.substr(start, end - start));
439-
start = end + 1;
440-
end = descriptor.find('.', start);
434+
// TODO: add support for unicode sequences e.g. \u263A
435+
std::string cur_tok;
436+
for (size_t cur = 0; cur < descriptor.size(); ++cur) {
437+
if ('\\' == descriptor[cur]) {
438+
++cur;
439+
if (cur >= descriptor.size()) {
440+
return false;
441+
}
442+
} else if ('.' == descriptor[cur]) {
443+
if (cur_tok.empty()) {
444+
return false;
445+
}
446+
tokens.push_back(cur_tok);
447+
cur_tok.clear();
448+
continue;
449+
}
450+
cur_tok.push_back(descriptor[cur]);
441451
}
442-
tokens.push_back(descriptor.substr(start));
452+
453+
if (cur_tok.empty()) {
454+
return false;
455+
}
456+
457+
tokens.push_back(cur_tok);
458+
return true;
443459
}
444460
} // namespace clp_s

components/core/src/clp_s/Utils.hpp

+2-2
Original file line numberDiff line numberDiff line change
@@ -211,9 +211,9 @@ class StringUtils {
211211
* Converts a string column descriptor delimited by '.' into a list of tokens
212212
* @param descriptor
213213
* @param tokens
214-
* @return the list of tokens pushed into the 'tokens' parameter
214+
* @return true if the descriptor was tokenized successfully, false otherwise
215215
*/
216-
static void
216+
[[nodiscard]] static bool
217217
tokenize_column_descriptor(std::string const& descriptor, std::vector<std::string>& tokens);
218218

219219
private:

components/core/src/clp_s/clp-s.cpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,10 @@ bool search_archive(
191191
try {
192192
for (auto const& column : command_line_arguments.get_projection_columns()) {
193193
std::vector<std::string> descriptor_tokens;
194-
StringUtils::tokenize_column_descriptor(column, descriptor_tokens);
194+
if (false == StringUtils::tokenize_column_descriptor(column, descriptor_tokens)) {
195+
SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column);
196+
return false;
197+
}
195198
projection->add_column(ColumnDescriptor::create(descriptor_tokens));
196199
}
197200
} catch (clp_s::TraceableException& e) {

components/core/src/clp_s/search/kql/Kql.g4

+1-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ fragment ESCAPED_SPACE
9696
;
9797
9898
fragment SPECIAL_CHARACTER
99-
: [\\():<>"*?{}]
99+
: [\\():<>"*?{}.]
100100
;
101101

102102

components/core/src/clp_s/search/kql/kql.cpp

+9-2
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,10 @@ class ParseTreeVisitor : public KqlBaseVisitor {
112112
std::string column = unquote_string(ctx->LITERAL()->getText());
113113

114114
std::vector<std::string> descriptor_tokens;
115-
StringUtils::tokenize_column_descriptor(column, descriptor_tokens);
115+
if (false == StringUtils::tokenize_column_descriptor(column, descriptor_tokens)) {
116+
SPDLOG_ERROR("Can not tokenize invalid column: \"{}\"", column);
117+
return nullptr;
118+
}
116119

117120
return ColumnDescriptor::create(descriptor_tokens);
118121
}
@@ -248,6 +251,10 @@ std::shared_ptr<Expression> parse_kql_expression(std::istream& in) {
248251
}
249252

250253
ParseTreeVisitor visitor;
251-
return std::any_cast<std::shared_ptr<Expression>>(visitor.visitStart(tree));
254+
try {
255+
return std::any_cast<std::shared_ptr<Expression>>(visitor.visitStart(tree));
256+
} catch (std::exception& e) {
257+
return {};
258+
}
252259
}
253260
} // namespace clp_s::search::kql

components/core/tests/test-kql.cpp

+45
Original file line numberDiff line numberDiff line change
@@ -187,4 +187,49 @@ TEST_CASE("Test parsing KQL", "[KQL]") {
187187
auto failure = parse_kql_expression(incorrect_query);
188188
REQUIRE(nullptr == failure);
189189
}
190+
191+
SECTION("Escape sequences in column name") {
192+
auto query = GENERATE(
193+
"a\\.b.c: *",
194+
"\"a\\.b.c\": *",
195+
"a\\.b: {c: *}",
196+
"\"a\\.b\": {\"c\": *}"
197+
);
198+
stringstream escaped_column_query{query};
199+
auto filter
200+
= std::dynamic_pointer_cast<FilterExpr>(parse_kql_expression(escaped_column_query));
201+
REQUIRE(nullptr != filter);
202+
REQUIRE(nullptr != filter->get_operand());
203+
REQUIRE(nullptr != filter->get_column());
204+
REQUIRE(false == filter->has_only_expression_operands());
205+
REQUIRE(false == filter->is_inverted());
206+
REQUIRE(FilterOperation::EQ == filter->get_operation());
207+
REQUIRE(2 == filter->get_column()->get_descriptor_list().size());
208+
auto it = filter->get_column()->descriptor_begin();
209+
REQUIRE(DescriptorToken{"a.b"} == *it++);
210+
REQUIRE(DescriptorToken{"c"} == *it++);
211+
}
212+
213+
SECTION("Illegal escape sequences in column name") {
214+
auto query = GENERATE(
215+
//"a\\:*", this case is technically legal since ':' gets escaped
216+
"\"a\\\":*",
217+
"a\\ :*",
218+
"\"a\\\" :*",
219+
"a.:*",
220+
"\"a.\":*",
221+
"a. :*",
222+
"\"a.\" :*"
223+
);
224+
stringstream illegal_escape{query};
225+
auto filter = parse_kql_expression(illegal_escape);
226+
REQUIRE(nullptr == filter);
227+
}
228+
229+
SECTION("Empty token in column name") {
230+
auto query = GENERATE(".a:*", "a.:*", "a..c:*", "a.b.:*");
231+
stringstream empty_token_column{query};
232+
auto filter = parse_kql_expression(empty_token_column);
233+
REQUIRE(nullptr == filter);
234+
}
190235
}

0 commit comments

Comments
 (0)