Skip to content

Commit 26cfe53

Browse files
authored
mango: add $beginsWith operator (#4810)
Adds a `$beginsWith` operator to selectors, with json and text index support. This is a complement / precursor to optimising `$regex` support as proposed in #4776. For `json` indexes, a $beginsWith operator translates into a key range query, as is common practice for _view queries. For example, to find all rows with a key beginning with "W", we can use a range `start_key="W", end_key="W\ufff0"`. Given Mango uses compound keys, this is slightly more complex in practice, but the idea is the same. As with other range operators (`$gt`, `$gte`, etc), `$beginsWith` can be used in combination with equality operators and result sorting but must result in a contiguous key range. That is, a range of `start_key=[10, "W"], end_key=[10, "W\ufff0", {}]` would be valid, but `start_key=["W", 10], end_key=["W\ufff0", 10, {}]` would not, because the second element of the key may result in a non-contiguous range. For text indexes, `$beginsWith` translates to a Lucene query on the specified field of `W*`. If a non-string operand is provided to `$beginsWith`, the request will fail with a 400 / `invalid_operator` error.
1 parent 682f512 commit 26cfe53

File tree

6 files changed

+312
-91
lines changed

6 files changed

+312
-91
lines changed

Diff for: src/docs/src/api/database/find.rst

+77-67
Original file line numberDiff line numberDiff line change
@@ -200,8 +200,9 @@ A simple selector, inspecting specific fields:
200200
201201
You can create more complex selector expressions by combining operators.
202202
For best performance, it is best to combine 'combination' or
203-
'array logical' operators, such as ``$regex``, with an equality
204-
operators such as ``$eq``, ``$gt``, ``$gte``, ``$lt``, and ``$lte``
203+
'array logical' operators, such as ``$regex``, with an operator
204+
that defines a contiguous range of keys such as ``$eq``,
205+
``$gt``, ``$gte``, ``$lt``, ``$lte``, and ``$beginsWith``
205206
(but not ``$ne``). For more information about creating complex
206207
selector expressions, see :ref:`creating selector expressions
207208
<find/expressions>`.
@@ -673,68 +674,74 @@ In addition, some 'meta' condition operators are available. Some condition
673674
operators accept any valid JSON content as the argument. Other condition
674675
operators require the argument to be in a specific JSON format.
675676

676-
+---------------+-------------+------------+-----------------------------------+
677-
| Operator type | Operator | Argument | Purpose |
678-
+===============+=============+============+===================================+
679-
| (In)equality | ``$lt`` | Any JSON | The field is less than the |
680-
| | | | argument. |
681-
+---------------+-------------+------------+-----------------------------------+
682-
| | ``$lte`` | Any JSON | The field is less than or equal to|
683-
| | | | the argument. |
684-
+---------------+-------------+------------+-----------------------------------+
685-
| | ``$eq`` | Any JSON | The field is equal to the argument|
686-
+---------------+-------------+------------+-----------------------------------+
687-
| | ``$ne`` | Any JSON | The field is not equal to the |
688-
| | | | argument. |
689-
+---------------+-------------+------------+-----------------------------------+
690-
| | ``$gte`` | Any JSON | The field is greater than or equal|
691-
| | | | to the argument. |
692-
+---------------+-------------+------------+-----------------------------------+
693-
| | ``$gt`` | Any JSON | The field is greater than the |
694-
| | | | to the argument. |
695-
+---------------+-------------+------------+-----------------------------------+
696-
| Object | ``$exists`` | Boolean | Check whether the field exists or |
697-
| | | | not, regardless of its value. |
698-
+---------------+-------------+------------+-----------------------------------+
699-
| | ``$type`` | String | Check the document field's type. |
700-
| | | | Valid values are ``"null"``, |
701-
| | | | ``"boolean"``, ``"number"``, |
702-
| | | | ``"string"``, ``"array"``, and |
703-
| | | | ``"object"``. |
704-
+---------------+-------------+------------+-----------------------------------+
705-
| Array | ``$in`` | Array of | The document field must exist in |
706-
| | | JSON values| the list provided. |
707-
+---------------+-------------+------------+-----------------------------------+
708-
| | ``$nin`` | Array of | The document field not must exist |
709-
| | | JSON values| in the list provided. |
710-
+---------------+-------------+------------+-----------------------------------+
711-
| | ``$size`` | Integer | Special condition to match the |
712-
| | | | length of an array field in a |
713-
| | | | document. Non-array fields cannot |
714-
| | | | match this condition. |
715-
+---------------+-------------+------------+-----------------------------------+
716-
| Miscellaneous | ``$mod`` | [Divisor, | Divisor is a non-zero integer, |
717-
| | | Remainder] | Remainder is any integer. |
718-
| | | | Non-integer values result in a |
719-
| | | | 404. Matches documents where |
720-
| | | | ``field % Divisor == Remainder`` |
721-
| | | | is true, and only when the |
722-
| | | | document field is an integer. |
723-
+---------------+-------------+------------+-----------------------------------+
724-
| | ``$regex`` | String | A regular expression pattern to |
725-
| | | | match against the document field. |
726-
| | | | Only matches when the field is a |
727-
| | | | string value and matches the |
728-
| | | | supplied regular expression. The |
729-
| | | | matching algorithms are based on |
730-
| | | | the Perl Compatible Regular |
731-
| | | | Expression (PCRE) library. For |
732-
| | | | more information about what is |
733-
| | | | implemented, see the see the |
734-
| | | | `Erlang Regular Expression |
735-
| | | | <http://erlang.org/doc |
736-
| | | | /man/re.html>`_. |
737-
+---------------+-------------+------------+-----------------------------------+
677+
+---------------+-----------------+-------------+------------------------------------+
678+
| Operator type | Operator | Argument | Purpose |
679+
+===============+=================+=============+====================================+
680+
| (In)equality | ``$lt`` | Any JSON | The field is less than the |
681+
| | | | argument. |
682+
+---------------+-----------------+-------------+------------------------------------+
683+
| | ``$lte`` | Any JSON | The field is less than or equal to |
684+
| | | | the argument. |
685+
+---------------+-----------------+-------------+------------------------------------+
686+
| | ``$eq`` | Any JSON | The field is equal to the argument |
687+
+---------------+-----------------+-------------+------------------------------------+
688+
| | ``$ne`` | Any JSON | The field is not equal to the |
689+
| | | | argument. |
690+
+---------------+-----------------+-------------+------------------------------------+
691+
| | ``$gte`` | Any JSON | The field is greater than or equal |
692+
| | | | to the argument. |
693+
+---------------+-----------------+-------------+------------------------------------+
694+
| | ``$gt`` | Any JSON | The field is greater than the |
695+
| | | | argument. |
696+
+---------------+-----------------+-------------+------------------------------------+
697+
| Object | ``$exists`` | Boolean | Check whether the field exists or |
698+
| | | | not, regardless of its value. |
699+
+---------------+-----------------+-------------+------------------------------------+
700+
| | ``$type`` | String | Check the document field's type. |
701+
| | | | Valid values are ``"null"``, |
702+
| | | | ``"boolean"``, ``"number"``, |
703+
| | | | ``"string"``, ``"array"``, and |
704+
| | | | ``"object"``. |
705+
+---------------+-----------------+-------------+------------------------------------+
706+
| Array | ``$in`` | Array of | The document field must exist in |
707+
| | | JSON values | the list provided. |
708+
+---------------+-----------------+-------------+------------------------------------+
709+
| | ``$nin`` | Array of | The document field must not exist |
710+
| | | JSON values | in the list provided. |
711+
+---------------+-----------------+-------------+------------------------------------+
712+
| | ``$size`` | Integer | Special condition to match the |
713+
| | | | length of an array field in a |
714+
| | | | document. Non-array fields cannot |
715+
| | | | match this condition. |
716+
+---------------+-----------------+-------------+------------------------------------+
717+
| Miscellaneous | ``$mod`` | [Divisor, | Divisor is a non-zero integer, |
718+
| | | Remainder] | Remainder is any integer. |
719+
| | | | Non-integer values result in a |
720+
| | | | 404. Matches documents where |
721+
| | | | ``field % Divisor == Remainder`` |
722+
| | | | is true, and only when the |
723+
| | | | document field is an integer. |
724+
+---------------+-----------------+-------------+------------------------------------+
725+
| | ``$regex`` | String | A regular expression pattern to |
726+
| | | | match against the document field. |
727+
| | | | Only matches when the field is a |
728+
| | | | string value and matches the |
729+
| | | | supplied regular expression. The |
730+
| | | | matching algorithms are based on |
731+
| | | | the Perl Compatible Regular |
732+
| | | | Expression (PCRE) library. For |
733+
| | | | more information about what is |
734+
| | | | implemented, see the |
735+
| | | | `Erlang Regular Expression |
736+
| | | | <http://erlang.org/doc |
737+
| | | | /man/re.html>`_. |
738+
+---------------+-----------------+-------------+------------------------------------+
739+
| | ``$beginsWith`` | String | Matches where the document field |
740+
| | | | begins with the specified prefix |
741+
| | | | (case-sensitive). If the document |
742+
| | | | field contains a non-string value, |
743+
| | | | the document is not matched. |
744+
+---------------+-----------------+-------------+------------------------------------+
738745

739746
.. warning::
740747
Regular expressions do not work with indexes, so they should not be used to
@@ -753,9 +760,12 @@ In general, whenever you have an operator that takes an argument, that argument
753760
can itself be another operator with arguments of its own. This enables us to
754761
build up more complex selector expressions.
755762

756-
However, only equality operators such as ``$eq``, ``$gt``, ``$gte``, ``$lt``,
757-
and ``$lte`` (but not ``$ne``) can be used as the basis of a query. You should
758-
include at least one of these in a selector.
763+
However, only operators that define a contiguous range of values
764+
such as ``$eq``, ``$gt``, ``$gte``, ``$lt``, ``$lte``,
765+
and ``$beginsWith`` (but not ``$ne``) can be used as the basis
766+
of a query that can make efficient use of a ``json`` index. You should
767+
include at least one of these in a selector, or consider using
768+
a ``text`` index if greater flexibility is required.
759769

760770
For example, if you try to perform a query that attempts to match all documents
761771
that have a field called `afieldname` containing a value that begins with the

Diff for: src/mango/src/mango_idx_view.erl

+6
Original file line numberDiff line numberDiff line change
@@ -306,6 +306,8 @@ indexable({[{<<"$gt">>, _}]}) ->
306306
true;
307307
indexable({[{<<"$gte">>, _}]}) ->
308308
true;
309+
% A `$beginsWith` condition is reported as indexable, whatever its argument.
indexable({[{<<"$beginsWith">>, _}]}) ->
310+
true;
309311
% This is required to improve index selection for covering indexes.
310312
% Making `$exists` indexable should not cause problems in other cases.
311313
indexable({[{<<"$exists">>, _}]}) ->
@@ -412,6 +414,10 @@ range(_, _, LCmp, Low, HCmp, High) ->
412414
% operators but its all straight forward once you figure out how
413415
% we're basically just narrowing our logical ranges.
414416

417+
% beginsWith requires both a high and low bound: the range is first narrowed
% as for `$gte` on Arg, and the result is then narrowed as for `$lte` on Arg
% with a trailing sentinel appended.
% NOTE(review): the `16#10FFFF` segment carries no type specifier, so the
% default 8-bit integer segment applies and only the low byte (16#FF) is
% appended, not the UTF-8 encoding of code point U+10FFFF. Confirm this is
% the intended upper bound; `16#10FFFF/utf8` would append the encoded code
% point instead.
418+
range({[{<<"$beginsWith">>, Arg}]}, LCmp, Low, HCmp, High) ->
419+
{LCmp0, Low0, HCmp0, High0} = range({[{<<"$gte">>, Arg}]}, LCmp, Low, HCmp, High),
420+
range({[{<<"$lte">>, <<Arg/binary, 16#10FFFF>>}]}, LCmp0, Low0, HCmp0, High0);
415421
range({[{<<"$lt">>, Arg}]}, LCmp, Low, HCmp, High) ->
416422
case range_pos(Low, Arg, High) of
417423
min ->

Diff for: src/mango/src/mango_selector.erl

+47-23
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,8 @@ norm_ops({[{<<"$text">>, Arg}]}) when
135135
{[{<<"$default">>, {[{<<"$text">>, Arg}]}}]};
136136
norm_ops({[{<<"$text">>, Arg}]}) ->
137137
?MANGO_ERROR({bad_arg, '$text', Arg});
138+
% A `$beginsWith` condition with a binary argument is passed through
% unchanged. Non-binary arguments do not match this clause.
norm_ops({[{<<"$beginsWith">>, Arg}]} = Cond) when is_binary(Arg) ->
139+
Cond;
138140
% Not technically an operator but we pass it through here
139141
% so that this function accepts its own output. This exists
140142
% so that $text can have a field name value which simplifies
@@ -514,6 +516,11 @@ match({[{<<"$mod">>, [D, R]}]}, Value, _Cmp) when is_integer(Value) ->
514516
Value rem D == R;
515517
match({[{<<"$mod">>, _}]}, _Value, _Cmp) ->
516518
false;
519+
% `$beginsWith`: when both Prefix and Value are binaries, the result is true
% unless string:prefix/2 reports `nomatch`.
match({[{<<"$beginsWith">>, Prefix}]}, Value, _Cmp) when is_binary(Prefix), is_binary(Value) ->
520+
string:prefix(Value, Prefix) /= nomatch;
521+
% When Value is not a string, do not match
522+
match({[{<<"$beginsWith">>, Prefix}]}, _, _Cmp) when is_binary(Prefix) ->
523+
false;
517524
match({[{<<"$regex">>, Regex}]}, Value, _Cmp) when is_binary(Value) ->
518525
try
519526
match == re:run(Value, Regex, [{capture, none}])
@@ -652,6 +659,14 @@ fields({[]}) ->
652659
-ifdef(TEST).
653660
-include_lib("eunit/include/eunit.hrl").
654661

662+
% Document fixture used by the selector matching test helpers: binary `_id`
% and `_rev` fields plus an integer `user_id` field.
-define(TEST_DOC,
663+
{[
664+
{<<"_id">>, <<"foo">>},
665+
{<<"_rev">>, <<"bar">>},
666+
{<<"user_id">>, 11}
667+
]}
668+
).
669+
655670
is_constant_field_basic_test() ->
656671
Selector = normalize({[{<<"A">>, <<"foo">>}]}),
657672
Field = <<"A">>,
@@ -991,30 +1006,22 @@ has_required_fields_or_nested_or_false_test() ->
9911006
Normalized = normalize(Selector),
9921007
?assertEqual(false, has_required_fields(Normalized, RequiredFields)).
9931008

1009+
% Test helper: normalizes Selector and evaluates it against ?TEST_DOC,
% returning whatever match_int/2 returns.
check_match(Selector) ->
1010+
% Call match_int/2 to avoid ERROR for missing metric; this is confusing
1011+
% in the middle of test output.
1012+
match_int(mango_selector:normalize(Selector), ?TEST_DOC).
1013+
9941014
%% This test shows the shape match/2 expects for its arguments.
995-
match_demo_test_() ->
996-
Doc =
997-
{[
998-
{<<"_id">>, <<"foo">>},
999-
{<<"_rev">>, <<"bar">>},
1000-
{<<"user_id">>, 11}
1001-
]},
1002-
Check = fun(Selector) ->
1003-
% Call match_int/2 to avoid ERROR for missing metric; this is confusing
1004-
% in the middle of test output.
1005-
match_int(mango_selector:normalize(Selector), Doc)
1006-
end,
1007-
[
1008-
% matching
1009-
?_assertEqual(true, Check({[{<<"user_id">>, 11}]})),
1010-
?_assertEqual(true, Check({[{<<"_id">>, <<"foo">>}]})),
1011-
?_assertEqual(true, Check({[{<<"_id">>, <<"foo">>}, {<<"_rev">>, <<"bar">>}]})),
1012-
% non-matching
1013-
?_assertEqual(false, Check({[{<<"user_id">>, 1234}]})),
1014-
% string 11 doesn't match number 11
1015-
?_assertEqual(false, Check({[{<<"user_id">>, <<"11">>}]})),
1016-
?_assertEqual(false, Check({[{<<"_id">>, <<"foo">>}, {<<"_rev">>, <<"quux">>}]}))
1017-
].
1015+
% Exercises check_match/1 with plain field/value selectors: three selectors
% expected to match the fixture document and three expected not to.
match_demo_test() ->
1016+
% matching
1017+
?assertEqual(true, check_match({[{<<"user_id">>, 11}]})),
1018+
?assertEqual(true, check_match({[{<<"_id">>, <<"foo">>}]})),
1019+
?assertEqual(true, check_match({[{<<"_id">>, <<"foo">>}, {<<"_rev">>, <<"bar">>}]})),
1020+
% non-matching
1021+
?assertEqual(false, check_match({[{<<"user_id">>, 1234}]})),
1022+
% string 11 doesn't match number 11
1023+
?assertEqual(false, check_match({[{<<"user_id">>, <<"11">>}]})),
1024+
?assertEqual(false, check_match({[{<<"_id">>, <<"foo">>}, {<<"_rev">>, <<"quux">>}]})).
10181025

10191026
fields_of(Selector) ->
10201027
fields(test_util:as_selector(Selector)).
@@ -1054,4 +1061,21 @@ fields_nor_test() ->
10541061
},
10551062
?assertEqual([<<"field1">>, <<"field2">>], fields_of(Selector2)).
10561063

1064+
% Test helper: builds the selector `{Field: {"$beginsWith": Prefix}}`,
% normalizes it and evaluates it against ?TEST_DOC via match_int/2.
check_beginswith(Field, Prefix) ->
1065+
Selector = {[{Field, {[{<<"$beginsWith">>, Prefix}]}}]},
1066+
% Call match_int/2 to avoid ERROR for missing metric; this is confusing
1067+
% in the middle of test output.
1068+
match_int(mango_selector:normalize(Selector), ?TEST_DOC).
1069+
1070+
% Covers `$beginsWith` matching through check_beginswith/2: a matching
% prefix on a string field, a non-string field value, and a non-string
% prefix that is expected to throw an invalid_operator mango error.
match_beginswith_test() ->
1071+
% matching
1072+
?assertEqual(true, check_beginswith(<<"_id">>, <<"f">>)),
1073+
% no match (user_id is not a binary string)
1074+
?assertEqual(false, check_beginswith(<<"user_id">>, <<"f">>)),
1075+
% invalid (prefix is not a binary string)
1076+
?assertThrow(
1077+
{mango_error, mango_selector, {invalid_operator, <<"$beginsWith">>}},
1078+
check_beginswith(<<"user_id">>, 1)
1079+
).
1080+
10571081
-endif.

Diff for: src/mango/src/mango_selector_text.erl

+11
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,11 @@ convert(Path, {[{<<"$exists">>, ShouldExist}]}) ->
142142
true -> FieldExists;
143143
false -> {op_not, {FieldExists, false}}
144144
end;
145+
% `$beginsWith` with a binary argument becomes an `op_field` term whose
% query value is the escaped argument followed by a `*` wildcard.
convert(Path, {[{<<"$beginsWith">>, Arg}]}) when is_binary(Arg) ->
146+
Prefix = mango_util:lucene_escape_query_value(Arg),
147+
Suffix = <<"*">>,
148+
PrefixSearch = <<Prefix/binary, Suffix/binary>>,
149+
{op_field, {make_field(Path, Arg), PrefixSearch}};
145150
% We're not checking the actual type here, just looking for
146151
% anything that has a possibility of matching by checking
147152
% for the field name. We use the same logic for $exists on
@@ -821,6 +826,12 @@ convert_nor_test() ->
821826
})
822827
).
823828

829+
% Checks that a `$beginsWith` selector on `field` with prefix `foo` converts
% to an `op_field` term on the string-typed field with the query `foo*`.
convert_beginswith_test() ->
830+
?assertEqual(
831+
{op_field, {[[<<"field">>], <<":">>, <<"string">>], <<"foo*">>}},
832+
convert_selector(#{<<"field">> => #{<<"$beginsWith">> => <<"foo">>}})
833+
).
834+
824835
to_query_test() ->
825836
F = fun(S) -> iolist_to_binary(to_query(S)) end,
826837
Input = {<<"name">>, <<"value">>},

0 commit comments

Comments
 (0)