diff options
author | Tim Barron <tjbarron@google.com> | 2023-03-09 18:56:43 -0800 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2023-03-09 18:56:43 -0800 |
commit | 3fe6aa4251989fb27863fdbf51e18d8c1f9e42dd (patch) | |
tree | be7ee8f21e62dcd3c30d394aafa6c9c2d0a6a56e | |
parent | 53032446fec369125a6dc47c9f66435e4a62410b (diff) | |
download | icing-3fe6aa4251989fb27863fdbf51e18d8c1f9e42dd.tar.gz |
Update Icing from upstream.
Descriptions:
======================================================================
Integrate Advanced Query w/ Suggest, make ADVANCED_QUERY default parser.
======================================================================
BUG: 208654892
Change-Id: I53889482e844648ec65565f9a7f7c8faa89baa7c
20 files changed, 994 insertions, 507 deletions
diff --git a/icing/icing-search-engine_suggest_test.cc b/icing/icing-search-engine_suggest_test.cc index 6973ad0..b3aeafc 100644 --- a/icing/icing-search-engine_suggest_test.cc +++ b/icing/icing-search-engine_suggest_test.cc @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "icing/icing-search-engine.h" - #include <cstdint> #include <limits> #include <memory> @@ -25,6 +23,7 @@ #include "gtest/gtest.h" #include "icing/document-builder.h" #include "icing/file/filesystem.h" +#include "icing/icing-search-engine.h" #include "icing/jni/jni-cache.h" #include "icing/portable/endian.h" #include "icing/portable/equals-proto.h" @@ -1508,6 +1507,95 @@ TEST_F(IcingSearchEngineSuggestTest, UnorderedElementsAre(EqualsProto(suggestionBarCatSubjectFoo))); } +TEST_F(IcingSearchEngineSuggestTest, SearchSuggestionsTest_InvalidPrefixTest) { + IcingSearchEngine icing(GetDefaultIcingOptions(), GetTestJniCache()); + ASSERT_THAT(icing.Initialize().status(), ProtoIsOk()); + ASSERT_THAT(icing.SetSchema(CreatePersonAndEmailSchema()).status(), + ProtoIsOk()); + + DocumentProto document1 = + DocumentBuilder() + .SetKey("namespace1", "uri1") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "bar fo") // "bar fo" + .AddStringProperty("body", "fool") + .Build(); + DocumentProto document2 = + DocumentBuilder() + .SetKey("namespace1", "uri2") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "bar cat foo") // "bar cat fool" + .AddStringProperty("body", "fool") + .Build(); + DocumentProto document3 = DocumentBuilder() + .SetKey("namespace1", "uri3") + .SetSchema("Email") + .SetCreationTimestampMs(10) + .AddStringProperty("subject", "fool") // "fool" + .AddStringProperty("body", "fool") + .Build(); + ASSERT_THAT(icing.Put(document1).status(), ProtoIsOk()); + ASSERT_THAT(icing.Put(document2).status(), ProtoIsOk()); + 
ASSERT_THAT(icing.Put(document3).status(), ProtoIsOk()); + + // Search for "f OR" + SuggestionSpecProto suggestion_spec; + suggestion_spec.set_prefix("f OR"); + suggestion_spec.set_num_to_return(10); + suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( + TermMatchType::PREFIX); + suggestion_spec.mutable_scoring_spec()->set_rank_by( + SuggestionScoringSpecProto::SuggestionRankingStrategy::DOCUMENT_COUNT); + + SuggestionResponse response = icing.SearchSuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + EXPECT_THAT(response.status(), ProtoIsOk()); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } else { + EXPECT_THAT(response.status(), + ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } + + // TODO(b/208654892): Update handling for hyphens to only consider it a hyphen + // within a TEXT token (rather than a MINUS token) when surrounded on both + // sides by TEXT rather than just preceded by TEXT. 
+ // Search for "f-" + suggestion_spec.set_prefix("f-"); + response = icing.SearchSuggestions(suggestion_spec); + EXPECT_THAT(response.status(), ProtoIsOk()); + EXPECT_THAT(response.suggestions(), IsEmpty()); + + // Search for "f:" + suggestion_spec.set_prefix("f:"); + response = icing.SearchSuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + EXPECT_THAT(response.status(), ProtoIsOk()); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } else { + EXPECT_THAT(response.status(), + ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } + + // Search for "OR OR - :" + suggestion_spec.set_prefix("OR OR - :"); + response = icing.SearchSuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + EXPECT_THAT(response.status(), ProtoIsOk()); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } else { + EXPECT_THAT(response.status(), + ProtoStatusIs(StatusProto::INVALID_ARGUMENT)); + EXPECT_THAT(response.suggestions(), IsEmpty()); + } +} + } // namespace } // namespace lib } // namespace icing diff --git a/icing/query/advanced_query_parser/abstract-syntax-tree.h b/icing/query/advanced_query_parser/abstract-syntax-tree.h index d18f6ea..67049ad 100644 --- a/icing/query/advanced_query_parser/abstract-syntax-tree.h +++ b/icing/query/advanced_query_parser/abstract-syntax-tree.h @@ -17,6 +17,7 @@ #include <memory> #include <string> +#include <string_view> #include <utility> #include <vector> @@ -52,24 +53,29 @@ class Node { class TerminalNode : public Node { public: - explicit TerminalNode(std::string value, bool is_prefix) - : value_(std::move(value)), is_prefix_(is_prefix) {} + explicit TerminalNode(std::string value, std::string_view raw_value, + bool is_prefix) + : value_(std::move(value)), + raw_value_(raw_value), + is_prefix_(is_prefix) {} const std::string& 
value() const& { return value_; } std::string value() && { return std::move(value_); } bool is_prefix() const { return is_prefix_; } + std::string_view raw_value() const { return raw_value_; } + private: std::string value_; + std::string_view raw_value_; bool is_prefix_; }; class FunctionNameNode : public TerminalNode { public: explicit FunctionNameNode(std::string value) - : TerminalNode(std::move(value), /*is_prefix=*/false) {} - + : TerminalNode(std::move(value), /*raw_value=*/"", /*is_prefix=*/false) {} void Accept(AbstractSyntaxTreeVisitor* visitor) const override { visitor->VisitFunctionName(this); } @@ -77,9 +83,9 @@ class FunctionNameNode : public TerminalNode { class StringNode : public TerminalNode { public: - explicit StringNode(std::string value, bool is_prefix = false) - : TerminalNode(std::move(value), is_prefix) {} - + explicit StringNode(std::string value, std::string_view raw_value, + bool is_prefix = false) + : TerminalNode(std::move(value), raw_value, is_prefix) {} void Accept(AbstractSyntaxTreeVisitor* visitor) const override { visitor->VisitString(this); } @@ -87,9 +93,9 @@ class StringNode : public TerminalNode { class TextNode : public TerminalNode { public: - explicit TextNode(std::string value, bool is_prefix = false) - : TerminalNode(std::move(value), is_prefix) {} - + explicit TextNode(std::string value, std::string_view raw_value, + bool is_prefix = false) + : TerminalNode(std::move(value), raw_value, is_prefix) {} void Accept(AbstractSyntaxTreeVisitor* visitor) const override { visitor->VisitText(this); } diff --git a/icing/query/advanced_query_parser/abstract-syntax-tree_test.cc b/icing/query/advanced_query_parser/abstract-syntax-tree_test.cc index a8599fd..5e28278 100644 --- a/icing/query/advanced_query_parser/abstract-syntax-tree_test.cc +++ b/icing/query/advanced_query_parser/abstract-syntax-tree_test.cc @@ -27,8 +27,8 @@ namespace { using ::testing::ElementsAre; TEST(AbstractSyntaxTreeTest, Simple) { - // foo - 
std::unique_ptr<Node> root = std::make_unique<TextNode>("foo"); + std::string_view query = "foo"; + std::unique_ptr<Node> root = std::make_unique<TextNode>("foo", query); SimpleVisitor visitor; root->Accept(&visitor); @@ -37,16 +37,16 @@ TEST(AbstractSyntaxTreeTest, Simple) { } TEST(AbstractSyntaxTreeTest, Composite) { - // (foo bar) OR baz + std::string_view query = "(foo bar) OR baz"; std::vector<std::unique_ptr<Node>> and_args; - and_args.push_back(std::make_unique<TextNode>("foo")); - and_args.push_back(std::make_unique<TextNode>("bar")); + and_args.push_back(std::make_unique<TextNode>("foo", query.substr(1, 3))); + and_args.push_back(std::make_unique<TextNode>("bar", query.substr(5, 3))); auto and_node = std::make_unique<NaryOperatorNode>("AND", std::move(and_args)); std::vector<std::unique_ptr<Node>> or_args; or_args.push_back(std::move(and_node)); - or_args.push_back(std::make_unique<TextNode>("baz")); + or_args.push_back(std::make_unique<TextNode>("baz", query.substr(13, 3))); std::unique_ptr<Node> root = std::make_unique<NaryOperatorNode>("OR", std::move(or_args)); @@ -72,9 +72,9 @@ TEST(AbstractSyntaxTreeTest, Function) { ElementsAre(EqualsNodeInfo("foo", NodeType::kFunctionName), EqualsNodeInfo("", NodeType::kFunction))); - // foo("bar") + std::string_view query = "foo(\"bar\")"; std::vector<std::unique_ptr<Node>> args; - args.push_back(std::make_unique<StringNode>("bar")); + args.push_back(std::make_unique<StringNode>("bar", query.substr(5, 3))); root = std::make_unique<FunctionNode>( std::make_unique<FunctionNameNode>("foo"), std::move(args)); visitor = SimpleVisitor(); @@ -85,9 +85,9 @@ TEST(AbstractSyntaxTreeTest, Function) { EqualsNodeInfo("bar", NodeType::kString), EqualsNodeInfo("", NodeType::kFunction))); - // foo(bar("baz")) + query = "foo(bar(\"baz\"))"; std::vector<std::unique_ptr<Node>> inner_args; - inner_args.push_back(std::make_unique<StringNode>("baz")); + inner_args.push_back(std::make_unique<StringNode>("baz", query.substr(9, 3))); 
args.clear(); args.push_back(std::make_unique<FunctionNode>( std::make_unique<FunctionNameNode>("bar"), std::move(inner_args))); @@ -105,14 +105,16 @@ TEST(AbstractSyntaxTreeTest, Function) { } TEST(AbstractSyntaxTreeTest, Restriction) { - // sender.name:(IMPORTANT OR URGENT) + std::string_view query = "sender.name:(IMPORTANT OR URGENT)"; std::vector<std::unique_ptr<TextNode>> member_args; - member_args.push_back(std::make_unique<TextNode>("sender")); - member_args.push_back(std::make_unique<TextNode>("name")); + member_args.push_back( + std::make_unique<TextNode>("sender", query.substr(0, 6))); + member_args.push_back(std::make_unique<TextNode>("name", query.substr(7, 4))); std::vector<std::unique_ptr<Node>> or_args; - or_args.push_back(std::make_unique<TextNode>("IMPORTANT")); - or_args.push_back(std::make_unique<TextNode>("URGENT")); + or_args.push_back( + std::make_unique<TextNode>("IMPORTANT", query.substr(13, 9))); + or_args.push_back(std::make_unique<TextNode>("URGENT", query.substr(26, 6))); std::vector<std::unique_ptr<Node>> has_args; has_args.push_back(std::make_unique<MemberNode>(std::move(member_args), diff --git a/icing/query/advanced_query_parser/function_test.cc b/icing/query/advanced_query_parser/function_test.cc index 3b3ca40..afd4e04 100644 --- a/icing/query/advanced_query_parser/function_test.cc +++ b/icing/query/advanced_query_parser/function_test.cc @@ -63,10 +63,10 @@ TEST(FunctionTest, ParamNotWrongTypeFails) { Function function, Function::Create(/*return_type=*/DataType::kString, "foo", /*params=*/{Param(DataType::kString)}, TrivialEval())); - // foo(bar) + std::string_view query = "foo(bar)"; std::vector<PendingValue> args; args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(4, 3), /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -77,10 +77,10 @@ TEST(FunctionTest, 
ParamRequiredArgSucceeds) { Function::Create(/*return_type=*/DataType::kString, "foo", /*params=*/{Param(DataType::kString)}, TrivialEval())); - // foo("bar") + std::string_view query = R"(foo("bar"))"; std::vector<PendingValue> args; args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); } @@ -136,19 +136,19 @@ TEST(FunctionTest, MultipleArgsTrailingOptionalSucceeds) { Param(DataType::kString, Cardinality::kOptional)}, TrivialEval())); - // foo("bar") + std::string_view query = R"(foo("bar"))"; std::vector<PendingValue> args; args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar", "baz") + query = R"(foo("bar", "baz"))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(12, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); } @@ -162,30 +162,30 @@ TEST(FunctionTest, MultipleArgsTrailingVariableSucceeds) { Param(DataType::kString, Cardinality::kVariable)}, TrivialEval())); - // foo("bar") + std::string_view query = R"(foo("bar"))"; std::vector<PendingValue> args; args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), 
/*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar", "baz") + query = R"(foo("bar", "baz"))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(12, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar", "baz", "bat") + query = R"(foo("bar", "baz", "bat"))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(12, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bat", /*is_prefix_val=*/false})); + QueryTerm{"bat", query.substr(19, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); } @@ -214,26 +214,26 @@ TEST(FunctionTest, MultipleArgsOptionalBeforeOptionalSucceeds) { ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar") + std::string_view query = R"(foo("bar"))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), 
IsTrue()); - // foo("bar", baz) + query = R"(foo("bar", baz))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(11, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo(baz) + query = R"(foo(baz))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(4, 3), /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } @@ -252,46 +252,46 @@ TEST(FunctionTest, MultipleArgsOptionalBeforeVariableSucceeds) { ICING_ASSERT_OK_AND_ASSIGN(PendingValue val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar") + std::string_view query = R"(foo("bar"))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar", baz) + query = R"(foo("bar", baz))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(11, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); 
EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo("bar", baz, bat) + query = R"(foo("bar", baz, bat))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateStringPendingValue( - QueryTerm{"bar", /*is_prefix_val=*/false})); + QueryTerm{"bar", query.substr(5, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(11, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"bat", /*is_prefix_val=*/false})); + QueryTerm{"bat", query.substr(16, 3), /*is_prefix_val=*/false})); ICING_ASSERT_OK_AND_ASSIGN(val, function.Eval(std::move(args))); EXPECT_THAT(val.is_placeholder(), IsTrue()); - // foo(baz) + query = R"(foo(baz))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(4, 3), /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); - // foo(baz, bat) + query = R"(foo(baz, bat))"; args = std::vector<PendingValue>(); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"baz", /*is_prefix_val=*/false})); + QueryTerm{"baz", query.substr(4, 3), /*is_prefix_val=*/false})); args.push_back(PendingValue::CreateTextPendingValue( - QueryTerm{"bat", /*is_prefix_val=*/false})); + QueryTerm{"bat", query.substr(9, 3), /*is_prefix_val=*/false})); EXPECT_THAT(function.Eval(std::move(args)), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } diff --git a/icing/query/advanced_query_parser/lexer.cc b/icing/query/advanced_query_parser/lexer.cc index 6cddd96..0dd0bb0 100644 --- a/icing/query/advanced_query_parser/lexer.cc +++ b/icing/query/advanced_query_parser/lexer.cc @@ -38,12 +38,13 @@ bool Lexer::ConsumeWhitespace() { } bool Lexer::ConsumeQuerySingleChar() { + std::string_view original_text = 
query_.substr(current_index_, 1); switch (current_char_) { case ':': - tokens_.push_back({":", TokenType::COMPARATOR}); + tokens_.push_back({":", original_text, TokenType::COMPARATOR}); break; case '*': - tokens_.push_back({"", TokenType::STAR}); + tokens_.push_back({"", original_text, TokenType::STAR}); break; case '-': if (in_text_) { @@ -51,7 +52,7 @@ bool Lexer::ConsumeQuerySingleChar() { // in the middle of a TEXT segment (ex. `foo-bar`). return false; } - tokens_.push_back({"", TokenType::MINUS}); + tokens_.push_back({"", original_text, TokenType::MINUS}); break; default: return false; @@ -61,18 +62,19 @@ bool Lexer::ConsumeQuerySingleChar() { } bool Lexer::ConsumeScoringSingleChar() { + std::string_view original_text = query_.substr(current_index_, 1); switch (current_char_) { case '+': - tokens_.push_back({"", TokenType::PLUS}); + tokens_.push_back({"", original_text, TokenType::PLUS}); break; case '*': - tokens_.push_back({"", TokenType::TIMES}); + tokens_.push_back({"", original_text, TokenType::TIMES}); break; case '/': - tokens_.push_back({"", TokenType::DIV}); + tokens_.push_back({"", original_text, TokenType::DIV}); break; case '-': - tokens_.push_back({"", TokenType::MINUS}); + tokens_.push_back({"", original_text, TokenType::MINUS}); break; default: return false; @@ -82,18 +84,19 @@ bool Lexer::ConsumeScoringSingleChar() { } bool Lexer::ConsumeGeneralSingleChar() { + std::string_view original_text = query_.substr(current_index_, 1); switch (current_char_) { case ',': - tokens_.push_back({"", TokenType::COMMA}); + tokens_.push_back({"", original_text, TokenType::COMMA}); break; case '.': - tokens_.push_back({"", TokenType::DOT}); + tokens_.push_back({"", original_text, TokenType::DOT}); break; case '(': - tokens_.push_back({"", TokenType::LPAREN}); + tokens_.push_back({"", original_text, TokenType::LPAREN}); break; case ')': - tokens_.push_back({"", TokenType::RPAREN}); + tokens_.push_back({"", original_text, TokenType::RPAREN}); break; default: 
return false; @@ -124,13 +127,17 @@ bool Lexer::ConsumeComparator() { // Matching for '<=', '>=', '!=', or '=='. char next_char = PeekNext(1); if (next_char == '=') { - tokens_.push_back({{current_char_, next_char}, TokenType::COMPARATOR}); + tokens_.push_back({{current_char_, next_char}, + query_.substr(current_index_, 2), + TokenType::COMPARATOR}); Advance(2); return true; } // Now, next_char must not be '='. Let's match for '<' and '>'. if (current_char_ == '<' || current_char_ == '>') { - tokens_.push_back({{current_char_}, TokenType::COMPARATOR}); + tokens_.push_back({{current_char_}, + query_.substr(current_index_, 1), + TokenType::COMPARATOR}); Advance(); return true; } @@ -145,10 +152,11 @@ bool Lexer::ConsumeAndOr() { if (current_char_ != next_char) { return false; } + std::string_view original_text = query_.substr(current_index_, 2); if (current_char_ == '&') { - tokens_.push_back({"", TokenType::AND}); + tokens_.push_back({"", original_text, TokenType::AND}); } else { - tokens_.push_back({"", TokenType::OR}); + tokens_.push_back({"", original_text, TokenType::OR}); } Advance(2); return true; @@ -158,37 +166,42 @@ bool Lexer::ConsumeStringLiteral() { if (current_char_ != '"') { return false; } - std::string text; Advance(); + int32_t unnormalized_start_pos = current_index_; while (current_char_ != '\0' && current_char_ != '"') { // When getting a backslash, we will always match the next character, even // if the next character is a quotation mark if (current_char_ == '\\') { - text.push_back(current_char_); Advance(); if (current_char_ == '\0') { // In this case, we are missing a terminating quotation mark. 
break; } } - text.push_back(current_char_); Advance(); } if (current_char_ == '\0') { SyntaxError("missing terminating \" character"); return false; } - tokens_.push_back({text, TokenType::STRING}); + int32_t unnormalized_length = current_index_ - unnormalized_start_pos; + std::string_view raw_token_text = + query_.substr(unnormalized_start_pos, unnormalized_length); + std::string token_text(raw_token_text); + tokens_.push_back({std::move(token_text), raw_token_text, TokenType::STRING}); Advance(); return true; } -bool Lexer::Text() { +bool Lexer::ConsumeText() { if (current_char_ == '\0') { return false; } - tokens_.push_back({"", TokenType::TEXT}); + tokens_.push_back({"", query_.substr(current_index_, 0), TokenType::TEXT}); int token_index = tokens_.size() - 1; + + int32_t unnormalized_start_pos = current_index_; + int32_t unnormalized_end_pos = current_index_; while (!ConsumeNonText() && current_char_ != '\0') { in_text_ = true; // When getting a backslash in TEXT, unescape it by accepting its following @@ -203,14 +216,18 @@ bool Lexer::Text() { } tokens_[token_index].text.push_back(current_char_); Advance(); - if (current_char_ == '(') { - // A TEXT followed by a LPAREN is a FUNCTION_NAME. - tokens_.back().type = TokenType::FUNCTION_NAME; - // No need to break, since NonText() must be true at this point. - } + unnormalized_end_pos = current_index_; } in_text_ = false; + tokens_[token_index].original_text = query_.substr( + unnormalized_start_pos, unnormalized_end_pos - unnormalized_start_pos); + if (unnormalized_end_pos < query_.length() && + query_[unnormalized_end_pos] == '(') { + // A TEXT followed by a LPAREN is a FUNCTION_NAME. + tokens_[token_index].type = TokenType::FUNCTION_NAME; + } + if (language_ == Lexer::Language::QUERY) { std::string &text = tokens_[token_index].text; TokenType &type = tokens_[token_index].type; @@ -234,7 +251,7 @@ Lexer::ExtractTokens() { // Clear out any non-text before matching a Text. 
while (ConsumeNonText()) { } - Text(); + ConsumeText(); } if (!error_.empty()) { return absl_ports::InvalidArgumentError( diff --git a/icing/query/advanced_query_parser/lexer.h b/icing/query/advanced_query_parser/lexer.h index f7f06dc..b313fa7 100644 --- a/icing/query/advanced_query_parser/lexer.h +++ b/icing/query/advanced_query_parser/lexer.h @@ -48,7 +48,9 @@ class Lexer { AND, // 'AND' | '&&' Not allowed in SCORING language. OR, // 'OR' | '||' Not allowed in SCORING language. NOT, // 'NOT' Not allowed in SCORING language. - STRING, // String literal surrounded by quotation marks + STRING, // String literal surrounded by quotation marks. The + // original_text of a STRING token will not include quotation + // marks. TEXT, // A sequence of chars that are not any above-listed operator FUNCTION_NAME, // A TEXT followed by LPAREN. // Whitespaces not inside a string literal will be skipped. @@ -69,6 +71,10 @@ class Lexer { // For other types, this field will be empty. std::string text; + // Lifecycle is dependent on the lifecycle of the string pointed to by + // query_. + std::string_view original_text; + // The type of the token. TokenType type; }; @@ -141,8 +147,9 @@ class Lexer { } // Try to match TEXT, FUNCTION_NAME, 'AND', 'OR' and 'NOT'. - // Should make sure that NonText() is false before calling into this method. - bool Text(); + // REQUIRES: ConsumeNonText() must be called immediately before calling this + // function. 
+ bool ConsumeText(); std::string_view query_; std::string error_; diff --git a/icing/query/advanced_query_parser/parser.cc b/icing/query/advanced_query_parser/parser.cc index 0e4c78d..fd74561 100644 --- a/icing/query/advanced_query_parser/parser.cc +++ b/icing/query/advanced_query_parser/parser.cc @@ -55,7 +55,8 @@ libtextclassifier3::StatusOr<std::unique_ptr<TextNode>> Parser::ConsumeText() { if (!Match(Lexer::TokenType::TEXT)) { return absl_ports::InvalidArgumentError("Unable to consume token as TEXT."); } - auto text_node = std::make_unique<TextNode>(std::move(current_token_->text)); + auto text_node = std::make_unique<TextNode>(std::move(current_token_->text), + current_token_->original_text); ++current_token_; return text_node; } @@ -81,6 +82,7 @@ Parser::ConsumeStringElement() { "Unable to consume token as STRING."); } std::string text = std::move(current_token_->text); + std::string_view raw_text = current_token_->original_text; ++current_token_; bool is_prefix = false; @@ -89,7 +91,7 @@ Parser::ConsumeStringElement() { ++current_token_; } - return std::make_unique<StringNode>(std::move(text), is_prefix); + return std::make_unique<StringNode>(std::move(text), raw_text, is_prefix); } libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() { @@ -115,7 +117,9 @@ Parser::ConsumeMember() { // at this point. So check for 'STAR' to differentiate the two cases. 
if (Match(Lexer::TokenType::STAR)) { Consume(Lexer::TokenType::STAR); - text_node = std::make_unique<TextNode>(std::move(*text_node).value(), + std::string_view raw_text = text_node->raw_value(); + std::string text = std::move(*text_node).value(); + text_node = std::make_unique<TextNode>(std::move(text), raw_text, /*is_prefix=*/true); children.push_back(std::move(text_node)); } else { diff --git a/icing/query/advanced_query_parser/parser_test.cc b/icing/query/advanced_query_parser/parser_test.cc index 502dbd3..824c2ce 100644 --- a/icing/query/advanced_query_parser/parser_test.cc +++ b/icing/query/advanced_query_parser/parser_test.cc @@ -46,9 +46,9 @@ TEST(ParserTest, EmptyScoring) { } TEST(ParserTest, SingleTerm) { - // Query: "foo" + std::string_view query = "foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}}; + {"foo", query, Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -67,9 +67,10 @@ TEST(ParserTest, SingleTerm) { } TEST(ParserTest, ImplicitAnd) { - // Query: "foo bar" + std::string_view query = "foo bar"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {"bar", Lexer::TokenType::TEXT}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"bar", query.substr(4, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -93,11 +94,11 @@ TEST(ParserTest, ImplicitAnd) { } TEST(ParserTest, Or) { - // Query: "foo OR bar" + std::string_view query = "foo OR bar"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::OR}, - {"bar", Lexer::TokenType::TEXT}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(4, 2), Lexer::TokenType::OR}, + {"bar", query.substr(7, 3), 
Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -121,11 +122,11 @@ TEST(ParserTest, Or) { } TEST(ParserTest, And) { - // Query: "foo AND bar" + std::string_view query = "foo AND bar"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::AND}, - {"bar", Lexer::TokenType::TEXT}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(4, 3), Lexer::TokenType::AND}, + {"bar", query.substr(8, 4), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -149,9 +150,10 @@ TEST(ParserTest, And) { } TEST(ParserTest, Not) { - // Query: "NOT foo" + std::string_view query = "NOT foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"", Lexer::TokenType::NOT}, {"foo", Lexer::TokenType::TEXT}}; + {"", query.substr(0, 3), Lexer::TokenType::NOT}, + {"foo", query.substr(4, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -173,9 +175,10 @@ TEST(ParserTest, Not) { } TEST(ParserTest, Minus) { - // Query: "-foo" + std::string_view query = "-foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"", Lexer::TokenType::MINUS}, {"foo", Lexer::TokenType::TEXT}}; + {"", query.substr(0, 1), Lexer::TokenType::MINUS}, + {"foo", query.substr(1, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -197,11 +200,11 @@ TEST(ParserTest, Minus) { } TEST(ParserTest, Has) { - // Query: "subject:foo" + std::string_view query = "subject:foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"subject", Lexer::TokenType::TEXT}, - {":", 
Lexer::TokenType::COMPARATOR}, - {"foo", Lexer::TokenType::TEXT}}; + {"subject", query.substr(0, 7), Lexer::TokenType::TEXT}, + {":", query.substr(7, 1), Lexer::TokenType::COMPARATOR}, + {"foo", query.substr(8, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -225,13 +228,13 @@ TEST(ParserTest, Has) { } TEST(ParserTest, HasNested) { - // Query: "sender.name:foo" + std::string_view query = "sender.name:foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"sender", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DOT}, - {"name", Lexer::TokenType::TEXT}, - {":", Lexer::TokenType::COMPARATOR}, - {"foo", Lexer::TokenType::TEXT}}; + {"sender", query.substr(0, 6), Lexer::TokenType::TEXT}, + {"", query.substr(6, 1), Lexer::TokenType::DOT}, + {"name", query.substr(7, 4), Lexer::TokenType::TEXT}, + {":", query.substr(11, 1), Lexer::TokenType::COMPARATOR}, + {"foo", query.substr(12, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -256,11 +259,11 @@ TEST(ParserTest, HasNested) { } TEST(ParserTest, EmptyFunction) { - // Query: "foo()" + std::string_view query = "foo()"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"", query.substr(4, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -279,12 +282,12 @@ TEST(ParserTest, EmptyFunction) { } TEST(ParserTest, FunctionSingleArg) { - // Query: "foo("bar")" + std::string_view query = "foo(\"bar\")"; 
std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"bar", Lexer::TokenType::STRING}, - {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(5, 3), Lexer::TokenType::STRING}, + {"", query.substr(8, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -304,11 +307,14 @@ TEST(ParserTest, FunctionSingleArg) { } TEST(ParserTest, FunctionMultiArg) { - // Query: "foo("bar", "baz")" + std::string_view query = "foo(\"bar\", \"baz\")"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, {"", Lexer::TokenType::LPAREN}, - {"bar", Lexer::TokenType::STRING}, {"", Lexer::TokenType::COMMA}, - {"baz", Lexer::TokenType::STRING}, {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(5, 3), Lexer::TokenType::STRING}, + {"", query.substr(9, 1), Lexer::TokenType::COMMA}, + {"baz", query.substr(12, 3), Lexer::TokenType::STRING}, + {"", query.substr(16, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -329,11 +335,14 @@ TEST(ParserTest, FunctionMultiArg) { } TEST(ParserTest, FunctionNested) { - // Query: "foo(bar())" + std::string_view query = "foo(bar())"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, {"", Lexer::TokenType::LPAREN}, - {"bar", Lexer::TokenType::FUNCTION_NAME}, {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::RPAREN}, {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", 
query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(4, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(7, 1), Lexer::TokenType::LPAREN}, + {"", query.substr(8, 1), Lexer::TokenType::RPAREN}, + {"", query.substr(9, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -356,13 +365,13 @@ TEST(ParserTest, FunctionNested) { } TEST(ParserTest, FunctionWithTrailingSequence) { - // Query: "foo() OR bar" + std::string_view query = "foo() OR bar"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::OR}, - {"bar", Lexer::TokenType::TEXT}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"", query.substr(4, 1), Lexer::TokenType::RPAREN}, + {"", query.substr(6, 2), Lexer::TokenType::OR}, + {"bar", query.substr(9, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -386,11 +395,14 @@ TEST(ParserTest, FunctionWithTrailingSequence) { } TEST(ParserTest, Composite) { - // Query: "foo OR (bar baz)" + std::string_view query = "foo OR (bar baz)"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::OR}, - {"", Lexer::TokenType::LPAREN}, {"bar", Lexer::TokenType::TEXT}, - {"baz", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(4, 2), Lexer::TokenType::OR}, + {"", query.substr(7, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(8, 3), Lexer::TokenType::TEXT}, + {"baz", query.substr(12, 3), Lexer::TokenType::TEXT}, + {"", query.substr(15, 1), Lexer::TokenType::RPAREN}}; Parser 
parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -419,11 +431,14 @@ TEST(ParserTest, Composite) { } TEST(ParserTest, CompositeWithTrailingSequence) { - // Query: "(bar baz) OR foo" + std::string_view query = "(bar baz) OR foo"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"", Lexer::TokenType::LPAREN}, {"bar", Lexer::TokenType::TEXT}, - {"baz", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::OR}, {"foo", Lexer::TokenType::TEXT}}; + {"", query.substr(0, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(1, 3), Lexer::TokenType::TEXT}, + {"baz", query.substr(5, 3), Lexer::TokenType::TEXT}, + {"", query.substr(8, 1), Lexer::TokenType::RPAREN}, + {"", query.substr(10, 2), Lexer::TokenType::OR}, + {"foo", query.substr(13, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -452,17 +467,17 @@ TEST(ParserTest, CompositeWithTrailingSequence) { } TEST(ParserTest, Complex) { - // Query: "foo bar:baz OR pal("bat")" + std::string_view query = R"(foo bar:baz OR pal("bat"))"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, - {"bar", Lexer::TokenType::TEXT}, - {":", Lexer::TokenType::COMPARATOR}, - {"baz", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::OR}, - {"pal", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"bat", Lexer::TokenType::STRING}, - {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"bar", query.substr(4, 3), Lexer::TokenType::TEXT}, + {":", query.substr(7, 1), Lexer::TokenType::COMPARATOR}, + {"baz", query.substr(8, 3), Lexer::TokenType::TEXT}, + {"", query.substr(12, 2), Lexer::TokenType::OR}, + {"pal", query.substr(15, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(18, 1), Lexer::TokenType::LPAREN}, + 
{"bat", query.substr(20, 3), Lexer::TokenType::STRING}, + {"", query.substr(24, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -498,107 +513,116 @@ TEST(ParserTest, Complex) { } TEST(ParserTest, InvalidHas) { - // Query: "foo:" No right hand operand to : + std::string_view query = "foo:"; // No right hand operand to : std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {":", Lexer::TokenType::COMPARATOR}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {":", query.substr(3, 1), Lexer::TokenType::COMPARATOR}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidComposite) { - // Query: "(foo bar" No terminating RPAREN + std::string_view query = "(foo bar"; // No terminating RPAREN std::vector<Lexer::LexerToken> lexer_tokens = { - {"", Lexer::TokenType::LPAREN}, - {"foo", Lexer::TokenType::TEXT}, - {"bar", Lexer::TokenType::TEXT}}; + {"", query.substr(0, 1), Lexer::TokenType::LPAREN}, + {"foo", query.substr(1, 3), Lexer::TokenType::TEXT}, + {"bar", query.substr(5, 3), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidMember) { - // Query: "foo." 
DOT must have succeeding TEXT + std::string_view query = "foo."; // DOT must have succeeding TEXT std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DOT}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(3, 1), Lexer::TokenType::DOT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidOr) { - // Query: "foo OR" No right hand operand to OR + std::string_view query = "foo OR"; // No right hand operand to OR std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::OR}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(3, 2), Lexer::TokenType::OR}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidAnd) { - // Query: "foo AND" No right hand operand to AND + std::string_view query = "foo AND"; // No right hand operand to AND std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::AND}}; + {"foo", query.substr(0, 3), Lexer::TokenType::TEXT}, + {"", query.substr(4, 3), Lexer::TokenType::AND}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidNot) { - // Query: "NOT" No right hand operand to NOT - std::vector<Lexer::LexerToken> lexer_tokens = {{"", Lexer::TokenType::NOT}}; + std::string_view query = "NOT"; // No right hand operand to NOT + std::vector<Lexer::LexerToken> lexer_tokens = { + {"", query.substr(0, 3), Lexer::TokenType::NOT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } 
TEST(ParserTest, InvalidMinus) { - // Query: "-" No right hand operand to - - std::vector<Lexer::LexerToken> lexer_tokens = {{"", Lexer::TokenType::MINUS}}; + std::string_view query = "-"; // No right hand operand to - + std::vector<Lexer::LexerToken> lexer_tokens = { + {"", query.substr(0, 1), Lexer::TokenType::MINUS}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidFunctionCallNoRparen) { - // Query: "foo(" No terminating RPAREN + std::string_view query = "foo("; // No terminating RPAREN std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, {"", Lexer::TokenType::LPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 0), Lexer::TokenType::LPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidFunctionCallNoLparen) { - // Query: "foo bar" foo labeled FUNCTION_NAME despite no LPAREN + std::string_view query = + "foo bar"; // foo labeled FUNCTION_NAME despite no LPAREN std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"bar", Lexer::TokenType::FUNCTION_NAME}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"bar", query.substr(4, 3), Lexer::TokenType::FUNCTION_NAME}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, InvalidFunctionArgsHangingComma) { - // Query: "foo("bar",)" no valid arg following COMMA + std::string_view query = R"(foo("bar",))"; // no valid arg following COMMA std::vector<Lexer::LexerToken> lexer_tokens = { - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"bar", Lexer::TokenType::STRING}, - 
{"", Lexer::TokenType::COMMA}, - {"", Lexer::TokenType::RPAREN}}; + {"foo", query.substr(0, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(3, 1), Lexer::TokenType::LPAREN}, + {"bar", query.substr(5, 3), Lexer::TokenType::STRING}, + {"", query.substr(9, 1), Lexer::TokenType::COMMA}, + {"", query.substr(10, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeQuery(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); } TEST(ParserTest, ScoringPlus) { - // Scoring: "1 + 1 + 1" - std::vector<Lexer::LexerToken> lexer_tokens = {{"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, - {"1", Lexer::TokenType::TEXT}}; + std::string_view scoring_exp = "1 + 1 + 1"; + std::vector<Lexer::LexerToken> lexer_tokens = { + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS}, + {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(6, 1), Lexer::TokenType::PLUS}, + {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -622,12 +646,13 @@ TEST(ParserTest, ScoringPlus) { } TEST(ParserTest, ScoringMinus) { - // Scoring: "1 - 1 - 1" - std::vector<Lexer::LexerToken> lexer_tokens = {{"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::MINUS}, - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::MINUS}, - {"1", Lexer::TokenType::TEXT}}; + std::string_view scoring_exp = "1 - 1 - 1"; + std::vector<Lexer::LexerToken> lexer_tokens = { + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::MINUS}, + {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(6, 1), Lexer::TokenType::MINUS}, + {"1", scoring_exp.substr(8, 
1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -651,11 +676,14 @@ TEST(ParserTest, ScoringMinus) { } TEST(ParserTest, ScoringUnaryMinus) { - // Scoring: "1 + -1 + 1" + std::string_view scoring_exp = "1 + -1 + 1"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"1", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::PLUS}, - {"", Lexer::TokenType::MINUS}, {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, {"1", Lexer::TokenType::TEXT}}; + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS}, + {"", scoring_exp.substr(4, 1), Lexer::TokenType::MINUS}, + {"1", scoring_exp.substr(5, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(7, 1), Lexer::TokenType::PLUS}, + {"1", scoring_exp.substr(9, 1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -682,12 +710,15 @@ TEST(ParserTest, ScoringUnaryMinus) { } TEST(ParserTest, ScoringPlusMinus) { - // Scoring: "11 + 12 - 13 + 14" + std::string_view scoring_exp = "11 + 12 - 13 + 14"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"11", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::PLUS}, - {"12", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::MINUS}, - {"13", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::PLUS}, - {"14", Lexer::TokenType::TEXT}}; + {"11", scoring_exp.substr(0, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(3, 1), Lexer::TokenType::PLUS}, + {"12", scoring_exp.substr(5, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(6, 1), Lexer::TokenType::MINUS}, + {"13", scoring_exp.substr(8, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(11, 1), Lexer::TokenType::PLUS}, + {"14", scoring_exp.substr(13, 2), Lexer::TokenType::TEXT}}; Parser parser = 
Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -719,12 +750,13 @@ TEST(ParserTest, ScoringPlusMinus) { } TEST(ParserTest, ScoringTimes) { - // Scoring: "1 * 1 * 1" - std::vector<Lexer::LexerToken> lexer_tokens = {{"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::TIMES}, - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::TIMES}, - {"1", Lexer::TokenType::TEXT}}; + std::string_view scoring_exp = "1 * 1 * 1"; + std::vector<Lexer::LexerToken> lexer_tokens = { + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::TIMES}, + {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(6, 1), Lexer::TokenType::TIMES}, + {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -748,12 +780,13 @@ TEST(ParserTest, ScoringTimes) { } TEST(ParserTest, ScoringDiv) { - // Scoring: "1 / 1 / 1" - std::vector<Lexer::LexerToken> lexer_tokens = {{"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DIV}, - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DIV}, - {"1", Lexer::TokenType::TEXT}}; + std::string_view scoring_exp = "1 / 1 / 1"; + std::vector<Lexer::LexerToken> lexer_tokens = { + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::DIV}, + {"1", scoring_exp.substr(4, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(6, 1), Lexer::TokenType::DIV}, + {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -777,13 +810,17 @@ TEST(ParserTest, ScoringDiv) { } TEST(ParserTest, ScoringTimesDiv) { - // Scoring: "11 / 12 * 13 / 14 / 15" + std::string_view 
scoring_exp = "11 / 12 * 13 / 14 / 15"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"11", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DIV}, - {"12", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::TIMES}, - {"13", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DIV}, - {"14", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DIV}, - {"15", Lexer::TokenType::TEXT}}; + {"11", scoring_exp.substr(0, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(3, 1), Lexer::TokenType::DIV}, + {"12", scoring_exp.substr(5, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(8, 1), Lexer::TokenType::TIMES}, + {"13", scoring_exp.substr(10, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(13, 1), Lexer::TokenType::DIV}, + {"14", scoring_exp.substr(15, 2), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(18, 1), Lexer::TokenType::DIV}, + {"15", scoring_exp.substr(20, 2), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -817,29 +854,29 @@ TEST(ParserTest, ScoringTimesDiv) { } TEST(ParserTest, ComplexScoring) { - // Scoring: "1 + pow((2 * sin(3)), 4) + -5 / 6" + std::string_view scoring_exp = "1 + pow((2 * sin(3)), 4) + -5 / 6"; // With parentheses in function arguments. 
std::vector<Lexer::LexerToken> lexer_tokens = { - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, - {"pow", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::LPAREN}, - {"2", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::TIMES}, - {"sin", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"3", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::COMMA}, - {"4", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::PLUS}, - {"", Lexer::TokenType::MINUS}, - {"5", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DIV}, - {"6", Lexer::TokenType::TEXT}, + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS}, + {"pow", scoring_exp.substr(4, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(7, 1), Lexer::TokenType::LPAREN}, + {"", scoring_exp.substr(8, 1), Lexer::TokenType::LPAREN}, + {"2", scoring_exp.substr(9, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(11, 1), Lexer::TokenType::TIMES}, + {"sin", scoring_exp.substr(13, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(16, 1), Lexer::TokenType::LPAREN}, + {"3", scoring_exp.substr(17, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(18, 1), Lexer::TokenType::RPAREN}, + {"", scoring_exp.substr(19, 1), Lexer::TokenType::RPAREN}, + {"", scoring_exp.substr(20, 1), Lexer::TokenType::COMMA}, + {"4", scoring_exp.substr(22, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(23, 1), Lexer::TokenType::RPAREN}, + {"", scoring_exp.substr(25, 1), Lexer::TokenType::PLUS}, + {"", scoring_exp.substr(27, 1), Lexer::TokenType::MINUS}, + {"5", scoring_exp.substr(28, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(30, 1), Lexer::TokenType::DIV}, + {"6", scoring_exp.substr(32, 1), Lexer::TokenType::TEXT}, }; Parser parser = 
Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, @@ -869,27 +906,27 @@ TEST(ParserTest, ComplexScoring) { EqualsNodeInfo("DIV", NodeType::kNaryOperator), EqualsNodeInfo("PLUS", NodeType::kNaryOperator))); - // Scoring: "1 + pow(2 * sin(3), 4) + -5 / 6" + scoring_exp = "1 + pow(2 * sin(3), 4) + -5 / 6"; // Without parentheses in function arguments. lexer_tokens = { - {"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, - {"pow", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"2", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::TIMES}, - {"sin", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"3", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::COMMA}, - {"4", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::RPAREN}, - {"", Lexer::TokenType::PLUS}, - {"", Lexer::TokenType::MINUS}, - {"5", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DIV}, - {"6", Lexer::TokenType::TEXT}, + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS}, + {"pow", scoring_exp.substr(4, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(7, 1), Lexer::TokenType::LPAREN}, + {"2", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(10, 1), Lexer::TokenType::TIMES}, + {"sin", scoring_exp.substr(12, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(15, 1), Lexer::TokenType::LPAREN}, + {"3", scoring_exp.substr(16, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(17, 1), Lexer::TokenType::RPAREN}, + {"", scoring_exp.substr(18, 1), Lexer::TokenType::COMMA}, + {"4", scoring_exp.substr(20, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(21, 1), Lexer::TokenType::RPAREN}, + {"", scoring_exp.substr(23, 1), Lexer::TokenType::PLUS}, + {"", scoring_exp.substr(25, 1), Lexer::TokenType::MINUS}, + {"5", scoring_exp.substr(26, 1), 
Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(28, 1), Lexer::TokenType::DIV}, + {"6", scoring_exp.substr(30, 1), Lexer::TokenType::TEXT}, }; parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(tree_root, parser.ConsumeScoring()); @@ -899,13 +936,14 @@ TEST(ParserTest, ComplexScoring) { } TEST(ParserTest, ScoringMemberFunction) { - // Scoring: this.CreationTimestamp() + std::string_view scoring_exp = "this.CreationTimestamp()"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"this", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DOT}, - {"CreationTimestamp", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::RPAREN}}; + {"this", scoring_exp.substr(0, 4), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(4, 1), Lexer::TokenType::DOT}, + {"CreationTimestamp", scoring_exp.substr(5, 17), + Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(22, 1), Lexer::TokenType::LPAREN}, + {"", scoring_exp.substr(23, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -927,13 +965,13 @@ TEST(ParserTest, ScoringMemberFunction) { } TEST(ParserTest, QueryMemberFunction) { - // Query: this.foo() + std::string_view query = "this.foo()"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"this", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DOT}, - {"foo", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"", Lexer::TokenType::RPAREN}}; + {"this", query.substr(0, 4), Lexer::TokenType::TEXT}, + {"", query.substr(4, 1), Lexer::TokenType::DOT}, + {"foo", query.substr(5, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(8, 1), Lexer::TokenType::LPAREN}, + {"", query.substr(9, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ 
-954,18 +992,18 @@ TEST(ParserTest, QueryMemberFunction) { } TEST(ParserTest, ScoringComplexMemberFunction) { - // Scoring: a.b.fun(c, d) + std::string_view scoring_exp = "a.b.fun(c, d)"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"a", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DOT}, - {"b", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::DOT}, - {"fun", Lexer::TokenType::FUNCTION_NAME}, - {"", Lexer::TokenType::LPAREN}, - {"c", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::COMMA}, - {"d", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::RPAREN}}; + {"a", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(1, 1), Lexer::TokenType::DOT}, + {"b", scoring_exp.substr(2, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(3, 1), Lexer::TokenType::DOT}, + {"fun", scoring_exp.substr(4, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", scoring_exp.substr(7, 1), Lexer::TokenType::LPAREN}, + {"c", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(9, 1), Lexer::TokenType::COMMA}, + {"d", scoring_exp.substr(11, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(12, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeScoring()); @@ -993,13 +1031,18 @@ TEST(ParserTest, ScoringComplexMemberFunction) { } TEST(ParserTest, QueryComplexMemberFunction) { - // Query: this.abc.fun(def, ghi) + std::string_view query = "this.abc.fun(def, ghi)"; std::vector<Lexer::LexerToken> lexer_tokens = { - {"this", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DOT}, - {"abc", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::DOT}, - {"fun", Lexer::TokenType::FUNCTION_NAME}, {"", Lexer::TokenType::LPAREN}, - {"def", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::COMMA}, - {"ghi", Lexer::TokenType::TEXT}, {"", Lexer::TokenType::RPAREN}}; + {"this", query.substr(0, 4), Lexer::TokenType::TEXT}, + {"", query.substr(4, 1), 
Lexer::TokenType::DOT}, + {"abc", query.substr(5, 3), Lexer::TokenType::TEXT}, + {"", query.substr(8, 1), Lexer::TokenType::DOT}, + {"fun", query.substr(9, 3), Lexer::TokenType::FUNCTION_NAME}, + {"", query.substr(12, 1), Lexer::TokenType::LPAREN}, + {"def", query.substr(13, 3), Lexer::TokenType::TEXT}, + {"", query.substr(16, 1), Lexer::TokenType::COMMA}, + {"ghi", query.substr(17, 3), Lexer::TokenType::TEXT}, + {"", query.substr(20, 1), Lexer::TokenType::RPAREN}}; Parser parser = Parser::Create(std::move(lexer_tokens)); ICING_ASSERT_OK_AND_ASSIGN(std::unique_ptr<Node> tree_root, parser.ConsumeQuery()); @@ -1027,11 +1070,12 @@ TEST(ParserTest, QueryComplexMemberFunction) { } TEST(ParserTest, InvalidScoringToken) { - // Scoring: "1 + NOT 1" - std::vector<Lexer::LexerToken> lexer_tokens = {{"1", Lexer::TokenType::TEXT}, - {"", Lexer::TokenType::PLUS}, - {"", Lexer::TokenType::NOT}, - {"1", Lexer::TokenType::TEXT}}; + std::string_view scoring_exp = "1 + NOT 1"; + std::vector<Lexer::LexerToken> lexer_tokens = { + {"1", scoring_exp.substr(0, 1), Lexer::TokenType::TEXT}, + {"", scoring_exp.substr(2, 1), Lexer::TokenType::PLUS}, + {"", scoring_exp.substr(4, 3), Lexer::TokenType::NOT}, + {"1", scoring_exp.substr(8, 1), Lexer::TokenType::TEXT}}; Parser parser = Parser::Create(std::move(lexer_tokens)); EXPECT_THAT(parser.ConsumeScoring(), StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); diff --git a/icing/query/advanced_query_parser/pending-value.cc b/icing/query/advanced_query_parser/pending-value.cc index 903e12f..67bdc3a 100644 --- a/icing/query/advanced_query_parser/pending-value.cc +++ b/icing/query/advanced_query_parser/pending-value.cc @@ -36,7 +36,7 @@ libtextclassifier3::Status PendingValue::ParseInt() { "Unable to parse \"", query_term_.term, "\" as number.")); } data_type_ = DataType::kLong; - query_term_ = {"", false}; + query_term_ = {/*term=*/"", /*raw_term=*/"", /*is_prefix_val=*/false}; return libtextclassifier3::Status::OK; } diff --git 
a/icing/query/advanced_query_parser/pending-value.h b/icing/query/advanced_query_parser/pending-value.h index d18789d..1a6717e 100644 --- a/icing/query/advanced_query_parser/pending-value.h +++ b/icing/query/advanced_query_parser/pending-value.h @@ -38,17 +38,18 @@ enum class DataType { struct QueryTerm { std::string term; + std::string_view raw_term; bool is_prefix_val; }; // A holder for intermediate results when processing child nodes. struct PendingValue { - static PendingValue CreateStringPendingValue(QueryTerm query_term) { - return PendingValue(std::move(query_term), DataType::kString); + static PendingValue CreateStringPendingValue(QueryTerm str) { + return PendingValue(std::move(str), DataType::kString); } - static PendingValue CreateTextPendingValue(QueryTerm query_term) { - return PendingValue(std::move(query_term), DataType::kText); + static PendingValue CreateTextPendingValue(QueryTerm text) { + return PendingValue(std::move(text), DataType::kText); } PendingValue() : data_type_(DataType::kNone) {} @@ -125,7 +126,7 @@ struct PendingValue { private: explicit PendingValue(QueryTerm query_term, DataType data_type) - : query_term_({std::move(query_term)}), data_type_(data_type) {} + : query_term_(std::move(query_term)), data_type_(data_type) {} libtextclassifier3::Status CheckDataType(DataType required_data_type) const { if (data_type_ == required_data_type) { @@ -141,7 +142,7 @@ struct PendingValue { // iterator_ will be populated when data_type_ is kDocumentIterator. std::unique_ptr<DocHitInfoIterator> iterator_; - // string_vals_ will be populated when data_type_ is kStringList. + // string_vals_ will be populated when data_type_ kStringList. 
std::vector<std::string> string_vals_; // query_term_ will be populated when data_type_ is kString or kText diff --git a/icing/query/advanced_query_parser/query-visitor.cc b/icing/query/advanced_query_parser/query-visitor.cc index a1a9c38..58340c3 100644 --- a/icing/query/advanced_query_parser/query-visitor.cc +++ b/icing/query/advanced_query_parser/query-visitor.cc @@ -37,9 +37,12 @@ #include "icing/query/advanced_query_parser/lexer.h" #include "icing/query/advanced_query_parser/param.h" #include "icing/query/advanced_query_parser/parser.h" +#include "icing/query/advanced_query_parser/pending-value.h" +#include "icing/query/advanced_query_parser/util/string-util.h" #include "icing/query/query-features.h" #include "icing/schema/property-util.h" #include "icing/schema/section.h" +#include "icing/tokenization/token.h" #include "icing/tokenization/tokenizer.h" #include "icing/util/status-macros.h" @@ -54,32 +57,13 @@ struct CreateList { std::vector<std::string> values; values.reserve(args.size()); for (PendingValue& arg : args) { - QueryTerm val = std::move(arg).string_val().ValueOrDie(); - values.push_back(std::move(val.term)); + QueryTerm string_val = std::move(arg).string_val().ValueOrDie(); + values.push_back(std::move(string_val.term)); } return PendingValue(std::move(values)); } }; -libtextclassifier3::StatusOr<std::string> UnescapeStringValue( - std::string_view value) { - std::string result; - bool in_escape = false; - for (char c : value) { - if (in_escape) { - in_escape = false; - } else if (c == '\\') { - in_escape = true; - continue; - } else if (c == '"') { - return absl_ports::InvalidArgumentError( - "Encountered an unescaped quotation mark!"); - } - result += c; - } - return result; -} - bool IsNumericComparator(std::string_view operator_text) { if (operator_text.length() < 1 || operator_text.length() > 2) { return false; @@ -168,8 +152,10 @@ void QueryVisitor::PendingPropertyRestricts::AddValidRestricts( } 
libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> -QueryVisitor::CreateTermIterator(QueryTerm query_term) { +QueryVisitor::CreateTermIterator(const QueryTerm& query_term) { TermMatchType::Code match_type = GetTermMatchType(query_term.is_prefix_val); + int unnormalized_term_start = + query_term.raw_term.data() - raw_query_text_.data(); if (!processing_not_) { // 1. Add term to property_query_terms_map if (pending_property_restricts_.has_active_property_restricts()) { @@ -183,13 +169,11 @@ QueryVisitor::CreateTermIterator(QueryTerm query_term) { // 2. If needed add term iterator to query_term_iterators_ map. if (needs_term_frequency_info_) { - // TODO(b/152934343) Save "term start index" into Node and PendingValue and - // pass it into index.GetIterator ICING_ASSIGN_OR_RETURN( std::unique_ptr<DocHitInfoIterator> term_iterator, - index_.GetIterator(query_term.term, /*term_start_index=*/0, - /*unnormalized_term_length=*/0, kSectionIdMaskAll, - match_type, needs_term_frequency_info_)); + index_.GetIterator(query_term.term, unnormalized_term_start, + query_term.raw_term.length(), kSectionIdMaskAll, + match_type_, needs_term_frequency_info_)); query_term_iterators_[query_term.term] = std::make_unique<DocHitInfoIteratorFilter>( std::move(term_iterator), &document_store_, &schema_store_, @@ -198,10 +182,8 @@ QueryVisitor::CreateTermIterator(QueryTerm query_term) { } // 3. Add the term iterator. 
- // TODO(b/152934343) Save "term start index" into Node and PendingValue and - // pass it into index.GetIterator - return index_.GetIterator(query_term.term, /*term_start_index=*/0, - /*unnormalized_term_length=*/0, kSectionIdMaskAll, + return index_.GetIterator(query_term.term, unnormalized_term_start, + query_term.raw_term.length(), kSectionIdMaskAll, match_type, needs_term_frequency_info_); } @@ -266,7 +248,7 @@ libtextclassifier3::StatusOr<PendingValue> QueryVisitor::SearchFunction( } else { QueryVisitor query_visitor(&index_, &numeric_index_, &document_store_, &schema_store_, &normalizer_, &tokenizer_, - filter_options_, match_type_, + query->raw_term, filter_options_, match_type_, needs_term_frequency_info_, pending_property_restricts_, processing_not_); tree_root->Accept(&query_visitor); @@ -353,24 +335,44 @@ QueryVisitor::PopPendingIterator() { // The tokenizer will produce 1+ tokens out of the text. The prefix operator // only applies to the final token. bool reached_final_token = !token_itr->Advance(); + // raw_text is the portion of text_value.raw_term that hasn't yet been + // matched to any of the tokens that we've processed. raw_token will + // hold the portion of raw_text that corresponds to the current token that + // is being processed. + std::string_view raw_text = text_value.raw_term; + std::string_view raw_token; while (!reached_final_token) { std::vector<Token> tokens = token_itr->GetTokens(); - reached_final_token = !token_itr->Advance(); + if (tokens.size() > 1) { + // The tokenizer iterator iterates between token groups. In practice, + // the tokenizer used with QueryVisitor (PlainTokenizer) will always + // only produce a single token per token group. + return absl_ports::InvalidArgumentError( + "Encountered unexpected token group with >1 tokens."); + } - // The tokenizer iterator iterates between token groups. 
 In practice, the - // tokenizer used with QueryVisitor (PlainTokenizer) will always only - // produce a single token per token group. - // For simplicity, we will apply the prefix operator to *all* tokens - // in the final token group. - for (const Token& token : tokens) { - normalized_term = normalizer_.NormalizeTerm(token.text); - ICING_ASSIGN_OR_RETURN( - std::unique_ptr<DocHitInfoIterator> iterator, - CreateTermIterator( - QueryTerm{std::move(normalized_term), - reached_final_token && text_value.is_prefix_val})); - iterators.push_back(std::move(iterator)); + reached_final_token = !token_itr->Advance(); + const Token& token = tokens.at(0); + if (reached_final_token && token.text.length() == raw_text.length()) { + // Unescaped tokens are strictly smaller than their escaped counterparts. + // This means that if we're at the final token and token.length equals + // raw_text, then all of raw_text must correspond to this token. + raw_token = raw_text; + } else { + ICING_ASSIGN_OR_RETURN(raw_token, string_util::FindEscapedToken( + raw_text, token.text)); } + normalized_term = normalizer_.NormalizeTerm(token.text); + QueryTerm term_value{std::move(normalized_term), raw_token, + reached_final_token && text_value.is_prefix_val}; + ICING_ASSIGN_OR_RETURN(std::unique_ptr<DocHitInfoIterator> iterator, + CreateTermIterator(std::move(term_value))); + iterators.push_back(std::move(iterator)); + + // Remove raw_token from raw_text now that we've processed + // it. + const char* escaped_token_end = raw_token.data() + raw_token.length(); + raw_text = raw_text.substr(escaped_token_end - raw_text.data()); } // Finally, create an And Iterator. If there's only a single term here, then 
- auto unescaped_string_or = UnescapeStringValue(node->value()); + auto unescaped_string_or = string_util::UnescapeStringValue(node->value()); if (!unescaped_string_or.ok()) { pending_error_ = std::move(unescaped_string_or).status(); return; } std::string unescaped_string = std::move(unescaped_string_or).ValueOrDie(); - pending_values_.push(PendingValue::CreateStringPendingValue( - QueryTerm{std::move(unescaped_string), node->is_prefix()})); + QueryTerm val{std::move(unescaped_string), node->raw_value(), + node->is_prefix()}; + pending_values_.push(PendingValue::CreateStringPendingValue(std::move(val))); } void QueryVisitor::VisitText(const TextNode* node) { // TEXT nodes could either be a term (and will become DocHitInfoIteratorTerm) // or a property name. As such, we just push the TEXT value into pending // values and determine which it is at a later point. - pending_values_.push(PendingValue::CreateTextPendingValue( - QueryTerm{std::move(node->value()), node->is_prefix()})); + QueryTerm val{std::move(node->value()), node->raw_value(), node->is_prefix()}; + pending_values_.push(PendingValue::CreateTextPendingValue(std::move(val))); } void QueryVisitor::VisitMember(const MemberNode* node) { @@ -668,6 +671,8 @@ void QueryVisitor::VisitMember(const MemberNode* node) { libtextclassifier3::StatusOr<QueryTerm> member_or; std::vector<std::string> members; QueryTerm text_val; + const char* start = nullptr; + const char* end = nullptr; while (!pending_values_.empty() && !pending_values_.top().is_placeholder()) { member_or = PopPendingTextValue(); @@ -681,11 +686,19 @@ void QueryVisitor::VisitMember(const MemberNode* node) { "Cannot use prefix operator '*' within a property name!"); return; } + if (start == nullptr) { + start = text_val.raw_term.data(); + end = text_val.raw_term.data() + text_val.raw_term.length(); + } else { + start = std::min(start, text_val.raw_term.data()); + end = std::max(end, text_val.raw_term.data() + text_val.raw_term.length()); + } 
members.push_back(std::move(text_val.term)); } QueryTerm member; member.term = absl_ports::StrJoin(members.rbegin(), members.rend(), property_util::kPropertyPathSeparator); + member.raw_term = std::string_view(start, end - start); member.is_prefix_val = false; pending_value = PendingValue::CreateTextPendingValue(std::move(member)); } diff --git a/icing/query/advanced_query_parser/query-visitor.h b/icing/query/advanced_query_parser/query-visitor.h index 7498457..9fcaec0 100644 --- a/icing/query/advanced_query_parser/query-visitor.h +++ b/icing/query/advanced_query_parser/query-visitor.h @@ -49,12 +49,12 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { Index* index, const NumericIndex<int64_t>* numeric_index, const DocumentStore* document_store, const SchemaStore* schema_store, const Normalizer* normalizer, const Tokenizer* tokenizer, + std::string_view raw_query_text, DocHitInfoIteratorFilter::Options filter_options, TermMatchType::Code match_type, bool needs_term_frequency_info) : QueryVisitor(index, numeric_index, document_store, schema_store, - normalizer, tokenizer, filter_options, match_type, - needs_term_frequency_info, - + normalizer, tokenizer, raw_query_text, filter_options, + match_type, needs_term_frequency_info, PendingPropertyRestricts(), /*processing_not=*/false) {} @@ -105,9 +105,9 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { Index* index, const NumericIndex<int64_t>* numeric_index, const DocumentStore* document_store, const SchemaStore* schema_store, const Normalizer* normalizer, const Tokenizer* tokenizer, + std::string_view raw_query_text, DocHitInfoIteratorFilter::Options filter_options, TermMatchType::Code match_type, bool needs_term_frequency_info, - PendingPropertyRestricts pending_property_restricts, bool processing_not) : index_(*index), numeric_index_(*numeric_index), @@ -115,6 +115,7 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { schema_store_(*schema_store), normalizer_(*normalizer), 
tokenizer_(*tokenizer), + raw_query_text_(raw_query_text), filter_options_(std::move(filter_options)), match_type_(match_type), needs_term_frequency_info_(needs_term_frequency_info), @@ -133,7 +134,7 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { // - On success, a DocHitInfoIterator for the provided term // - INVALID_ARGUMENT if unable to create an iterator for the term. libtextclassifier3::StatusOr<std::unique_ptr<DocHitInfoIterator>> - CreateTermIterator(QueryTerm term); + CreateTermIterator(const QueryTerm& term); // Processes the PendingValue at the top of pending_values_, parses it into a // int64_t and pops the top. @@ -279,6 +280,7 @@ class QueryVisitor : public AbstractSyntaxTreeVisitor { const Normalizer& normalizer_; // Does not own! const Tokenizer& tokenizer_; // Does not own! + std::string_view raw_query_text_; DocHitInfoIteratorFilter::Options filter_options_; TermMatchType::Code match_type_; // Whether or not term_frequency information is needed. This affects: diff --git a/icing/query/advanced_query_parser/query-visitor_test.cc b/icing/query/advanced_query_parser/query-visitor_test.cc index 033e86b..a11c1c2 100644 --- a/icing/query/advanced_query_parser/query-visitor_test.cc +++ b/icing/query/advanced_query_parser/query-visitor_test.cc @@ -17,6 +17,7 @@ #include <cstdint> #include <limits> #include <memory> +#include <string_view> #include "icing/text_classifier/lib3/utils/base/status.h" #include "gmock/gmock.h" @@ -225,7 +226,7 @@ TEST_P(QueryVisitorTest, SimpleLessThan) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -262,7 +263,7 @@ TEST_P(QueryVisitorTest, SimpleLessThanEq) { ParseQueryHelper(query)); QueryVisitor 
query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -299,7 +300,7 @@ TEST_P(QueryVisitorTest, SimpleEqual) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -336,7 +337,7 @@ TEST_P(QueryVisitorTest, SimpleGreaterThanEq) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -373,7 +374,7 @@ TEST_P(QueryVisitorTest, SimpleGreaterThan) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -411,7 +412,7 @@ TEST_P(QueryVisitorTest, IntMinLessThanEqual) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, 
/*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -449,7 +450,7 @@ TEST_P(QueryVisitorTest, IntMaxGreaterThanEqual) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -488,7 +489,7 @@ TEST_P(QueryVisitorTest, NestedPropertyLessThan) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -510,7 +511,7 @@ TEST_P(QueryVisitorTest, IntParsingError) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -524,7 +525,7 @@ TEST_P(QueryVisitorTest, NotEqualsUnsupported) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -552,16 +553,19 @@ TEST_P(QueryVisitorTest, LessThanTooManyOperandsInvalid) { // Create an invalid AST for the query '3 < subscription.price 25' where '<' // has three 
operands - auto property_node = std::make_unique<TextNode>("subscription"); - auto subproperty_node = std::make_unique<TextNode>("price"); + std::string_view query = "3 < subscription.price 25"; + auto property_node = + std::make_unique<TextNode>("subscription", query.substr(4, 12)); + auto subproperty_node = + std::make_unique<TextNode>("price", query.substr(17, 5)); std::vector<std::unique_ptr<TextNode>> member_args; member_args.push_back(std::move(property_node)); member_args.push_back(std::move(subproperty_node)); auto member_node = std::make_unique<MemberNode>(std::move(member_args), /*function=*/nullptr); - auto value_node = std::make_unique<TextNode>("3"); - auto extra_value_node = std::make_unique<TextNode>("25"); + auto value_node = std::make_unique<TextNode>("3", query.substr(0, 1)); + auto extra_value_node = std::make_unique<TextNode>("25", query.substr(23, 2)); std::vector<std::unique_ptr<Node>> args; args.push_back(std::move(value_node)); args.push_back(std::move(member_node)); @@ -569,7 +573,7 @@ TEST_P(QueryVisitorTest, LessThanTooManyOperandsInvalid) { auto root_node = std::make_unique<NaryOperatorNode>("<", std::move(args)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -580,8 +584,11 @@ TEST_P(QueryVisitorTest, LessThanTooManyOperandsInvalid) { TEST_P(QueryVisitorTest, LessThanTooFewOperandsInvalid) { // Create an invalid AST for the query 'subscription.price <' where '<' // has a single operand - auto property_node = std::make_unique<TextNode>("subscription"); - auto subproperty_node = std::make_unique<TextNode>("price"); + std::string_view query = "subscription.price <"; + auto property_node = + std::make_unique<TextNode>("subscription", 
query.substr(0, 12)); + auto subproperty_node = + std::make_unique<TextNode>("price", query.substr(13, 5)); std::vector<std::unique_ptr<TextNode>> member_args; member_args.push_back(std::move(property_node)); member_args.push_back(std::move(subproperty_node)); @@ -593,7 +600,7 @@ TEST_P(QueryVisitorTest, LessThanTooFewOperandsInvalid) { auto root_node = std::make_unique<NaryOperatorNode>("<", std::move(args)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -624,7 +631,7 @@ TEST_P(QueryVisitorTest, LessThanNonExistentPropertyNotFound) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -640,7 +647,7 @@ TEST_P(QueryVisitorTest, LessThanNonExistentPropertyNotFound) { TEST_P(QueryVisitorTest, NeverVisitedReturnsInvalid) { QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), "", DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); EXPECT_THAT(std::move(query_visitor).ConsumeResults(), @@ -669,7 +676,7 @@ TEST_P(QueryVisitorTest, IntMinLessThanInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), 
tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -699,7 +706,7 @@ TEST_P(QueryVisitorTest, IntMaxGreaterThanInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -714,7 +721,7 @@ TEST_P(QueryVisitorTest, NumericComparisonPropertyStringIsInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -776,7 +783,7 @@ TEST_P(QueryVisitorTest, NumericComparatorDoesntAffectLaterTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -813,7 +820,7 @@ TEST_P(QueryVisitorTest, SingleTermTermFrequencyEnabled) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -865,7 +872,7 @@ 
TEST_P(QueryVisitorTest, SingleTermTermFrequencyDisabled) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/false); root_node->Accept(&query_visitor); @@ -917,7 +924,7 @@ TEST_P(QueryVisitorTest, SingleTermPrefix) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -933,7 +940,7 @@ TEST_P(QueryVisitorTest, SingleTermPrefix) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -953,7 +960,7 @@ TEST_P(QueryVisitorTest, PrefixOperatorAfterPropertyReturnsInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -967,7 +974,7 @@ TEST_P(QueryVisitorTest, PrefixOperatorAfterNumericValueReturnsInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), 
document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -981,7 +988,7 @@ TEST_P(QueryVisitorTest, PrefixOperatorAfterPropertyRestrictReturnsInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1019,7 +1026,7 @@ TEST_P(QueryVisitorTest, SegmentationWithPrefix) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1042,7 +1049,7 @@ TEST_P(QueryVisitorTest, SegmentationWithPrefix) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -1079,7 +1086,7 @@ TEST_P(QueryVisitorTest, SingleVerbatimTerm) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, 
DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1120,7 +1127,7 @@ TEST_P(QueryVisitorTest, SingleVerbatimTermPrefix) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_EXACT, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1172,7 +1179,7 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingQuote) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1218,7 +1225,7 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingEscape) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1266,7 +1273,7 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingNonSpecialChar) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1287,7 +1294,7 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingNonSpecialChar) 
{ ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -1336,7 +1343,7 @@ TEST_P(QueryVisitorTest, VerbatimTermNewLine) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1356,7 +1363,7 @@ TEST_P(QueryVisitorTest, VerbatimTermNewLine) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -1399,7 +1406,7 @@ TEST_P(QueryVisitorTest, VerbatimTermEscapingComplex) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1450,7 +1457,7 @@ TEST_P(QueryVisitorTest, SingleMinusTerm) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), 
normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1497,7 +1504,7 @@ TEST_P(QueryVisitorTest, SingleNotTerm) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1549,7 +1556,7 @@ TEST_P(QueryVisitorTest, NestedNotTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1615,7 +1622,7 @@ TEST_P(QueryVisitorTest, DeeplyNestedNotTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1653,7 +1660,7 @@ TEST_P(QueryVisitorTest, ImplicitAndTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1691,7 +1698,7 @@ 
TEST_P(QueryVisitorTest, ExplicitAndTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1729,7 +1736,7 @@ TEST_P(QueryVisitorTest, OrTerms) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1769,7 +1776,7 @@ TEST_P(QueryVisitorTest, AndOrTermPrecedence) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1789,7 +1796,7 @@ TEST_P(QueryVisitorTest, AndOrTermPrecedence) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -1808,7 +1815,7 @@ TEST_P(QueryVisitorTest, AndOrTermPrecedence) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_three( index_.get(), numeric_index_.get(), document_store_.get(), - 
schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_three); @@ -1863,7 +1870,7 @@ TEST_P(QueryVisitorTest, AndOrNotPrecedence) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -1882,7 +1889,7 @@ TEST_P(QueryVisitorTest, AndOrNotPrecedence) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -1943,7 +1950,7 @@ TEST_P(QueryVisitorTest, PropertyFilter) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2011,7 +2018,7 @@ TEST_F(QueryVisitorTest, MultiPropertyFilter) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, 
/*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2051,7 +2058,7 @@ TEST_P(QueryVisitorTest, PropertyFilterStringIsInvalid) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2105,7 +2112,7 @@ TEST_P(QueryVisitorTest, PropertyFilterNonNormalized) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2168,7 +2175,7 @@ TEST_P(QueryVisitorTest, PropertyFilterWithGrouping) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2231,7 +2238,7 @@ TEST_P(QueryVisitorTest, ValidNestedPropertyFilter) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2251,7 +2258,7 @@ TEST_P(QueryVisitorTest, ValidNestedPropertyFilter) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); 
QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -2313,7 +2320,7 @@ TEST_P(QueryVisitorTest, InvalidNestedPropertyFilter) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2333,7 +2340,7 @@ TEST_P(QueryVisitorTest, InvalidNestedPropertyFilter) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -2395,7 +2402,7 @@ TEST_P(QueryVisitorTest, NotWithPropertyFilter) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2415,7 +2422,7 @@ TEST_P(QueryVisitorTest, NotWithPropertyFilter) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), 
normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -2478,7 +2485,7 @@ TEST_P(QueryVisitorTest, PropertyFilterWithNot) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2501,7 +2508,7 @@ TEST_P(QueryVisitorTest, PropertyFilterWithNot) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -2579,7 +2586,7 @@ TEST_P(QueryVisitorTest, SegmentationTest) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2692,7 +2699,7 @@ TEST_P(QueryVisitorTest, PropertyRestrictsPopCorrectly) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, 
/*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2802,7 +2809,7 @@ TEST_P(QueryVisitorTest, UnsatisfiablePropertyRestrictsPopCorrectly) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2825,7 +2832,7 @@ TEST_F(QueryVisitorTest, UnsupportedFunctionReturnsInvalidArgument) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2839,7 +2846,7 @@ TEST_F(QueryVisitorTest, SearchFunctionTooFewArgumentsReturnsInvalidArgument) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2853,7 +2860,7 @@ TEST_F(QueryVisitorTest, SearchFunctionTooManyArgumentsReturnsInvalidArgument) { ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2869,7 +2876,7 @@ TEST_F(QueryVisitorTest, 
ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2881,7 +2888,7 @@ TEST_F(QueryVisitorTest, ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -2897,7 +2904,7 @@ TEST_F(QueryVisitorTest, ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2909,7 +2916,7 @@ TEST_F(QueryVisitorTest, ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -2924,7 +2931,7 @@ TEST_F(QueryVisitorTest, ParseQueryHelper(query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), query, 
DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -2984,7 +2991,7 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedFunctionCalls) { ParseQueryHelper(level_two_query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_two_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -3008,7 +3015,7 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedFunctionCalls) { QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), schema_store_.get(), normalizer_.get(), tokenizer_.get(), - DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + level_three_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); ICING_ASSERT_OK_AND_ASSIGN(query_results, @@ -3031,7 +3038,7 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedFunctionCalls) { QueryVisitor query_visitor_three( index_.get(), numeric_index_.get(), document_store_.get(), schema_store_.get(), normalizer_.get(), tokenizer_.get(), - DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + level_four_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_three); ICING_ASSERT_OK_AND_ASSIGN(query_results, @@ -3148,7 +3155,7 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsNarrowing) { ParseQueryHelper(level_one_query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_one_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, 
/*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -3179,7 +3186,7 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsNarrowing) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(level_two_query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_two_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -3205,7 +3212,7 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsNarrowing) { QueryVisitor query_visitor_three( index_.get(), numeric_index_.get(), document_store_.get(), schema_store_.get(), normalizer_.get(), tokenizer_.get(), - DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + level_three_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_three); ICING_ASSERT_OK_AND_ASSIGN(query_results, @@ -3322,7 +3329,7 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsExpandinging) { ParseQueryHelper(level_one_query)); QueryVisitor query_visitor( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_one_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor); @@ -3345,7 +3352,7 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsExpandinging) { ICING_ASSERT_OK_AND_ASSIGN(root_node, ParseQueryHelper(level_two_query)); QueryVisitor query_visitor_two( index_.get(), numeric_index_.get(), document_store_.get(), - schema_store_.get(), normalizer_.get(), tokenizer_.get(), + schema_store_.get(), normalizer_.get(), tokenizer_.get(), level_two_query, 
DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_two); @@ -3370,7 +3377,7 @@ TEST_F(QueryVisitorTest, SearchFunctionNestedPropertyRestrictsExpandinging) { QueryVisitor query_visitor_three( index_.get(), numeric_index_.get(), document_store_.get(), schema_store_.get(), normalizer_.get(), tokenizer_.get(), - DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, + level_three_query, DocHitInfoIteratorFilter::Options(), TERM_MATCH_PREFIX, /*needs_term_frequency_info_=*/true); root_node->Accept(&query_visitor_three); ICING_ASSERT_OK_AND_ASSIGN(query_results, diff --git a/icing/query/advanced_query_parser/util/string-util.cc b/icing/query/advanced_query_parser/util/string-util.cc new file mode 100644 index 0000000..9af2ed6 --- /dev/null +++ b/icing/query/advanced_query_parser/util/string-util.cc @@ -0,0 +1,106 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/query/advanced_query_parser/util/string-util.h" + +#include "icing/absl_ports/canonical_errors.h" +#include "icing/absl_ports/str_cat.h" + +namespace icing { +namespace lib { + +namespace string_util { + +libtextclassifier3::StatusOr<std::string> UnescapeStringValue( + std::string_view value) { + std::string result; + bool in_escape = false; + for (char c : value) { + if (in_escape) { + in_escape = false; + } else if (c == '\\') { + in_escape = true; + continue; + } else if (c == '"') { + return absl_ports::InvalidArgumentError( + "Encountered an unescaped quotation mark!"); + } + result += c; + } + return result; +} + +libtextclassifier3::StatusOr<std::string_view> FindEscapedToken( + std::string_view escaped_string, std::string_view unescaped_token) { + if (unescaped_token.empty()) { + return absl_ports::InvalidArgumentError( + "Cannot find escaped token in empty unescaped token."); + } + + // Find the start of unescaped_token within the escaped_string + const char* esc_string_end = escaped_string.data() + escaped_string.length(); + size_t pos = escaped_string.find(unescaped_token[0]); + const char* esc_token_start = (pos == std::string_view::npos) + ? esc_string_end + : escaped_string.data() + pos; + const char* esc_token_cur = esc_token_start; + const char* possible_next_start = nullptr; + bool is_escaped = false; + int i = 0; + for (; i < unescaped_token.length() && esc_token_cur < esc_string_end; + ++esc_token_cur) { + if (esc_token_cur != esc_token_start && + *esc_token_cur == unescaped_token[0] && + possible_next_start == nullptr) { + possible_next_start = esc_token_cur; + } + + // Every char in unescaped_token should either be an escape or match the + // next char in unescaped_token. + if (!is_escaped && *esc_token_cur == '\\') { + is_escaped = true; + } else if (*esc_token_cur == unescaped_token[i]) { + is_escaped = false; + ++i; + } else { + // No match. If we don't have a possible_next_start, then try to find one. 
+ if (possible_next_start == nullptr) { + pos = escaped_string.find(unescaped_token[0], + esc_token_cur - escaped_string.data()); + if (pos == std::string_view::npos) { + break; + } + esc_token_start = escaped_string.data() + pos; + } else { + esc_token_start = possible_next_start; + possible_next_start = nullptr; + } + // esc_token_start has been reset to a char that equals unescaped_token[0] + // The for loop above will advance esc_token_cur so set i to 1. + i = 1; + esc_token_cur = esc_token_start; + } + } + if (i != unescaped_token.length()) { + return absl_ports::InvalidArgumentError( + absl_ports::StrCat("Couldn't match chars at token=", unescaped_token, + ") and raw_text=", escaped_string)); + } + return std::string_view(esc_token_start, esc_token_cur - esc_token_start); +} + +} // namespace string_util + +} // namespace lib +} // namespace icing
\ No newline at end of file diff --git a/icing/query/advanced_query_parser/util/string-util.h b/icing/query/advanced_query_parser/util/string-util.h new file mode 100644 index 0000000..09fb451 --- /dev/null +++ b/icing/query/advanced_query_parser/util/string-util.h @@ -0,0 +1,49 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER__STRING_UTIL_H_ +#define ICING_QUERY_ADVANCED_QUERY_PARSER__STRING_UTIL_H_ + +#include <string> +#include <string_view> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" + +namespace icing { +namespace lib { + +namespace string_util { + +// Returns: +// - On success, value with the escapes removed. +// - INVALID_ARGUMENT if an non-escaped quote is encountered. +// Ex. "fo\\\\o" -> "fo\\o" +libtextclassifier3::StatusOr<std::string> UnescapeStringValue( + std::string_view value); + +// Returns: +// - On success, string_view pointing to the segment of escaped_string that, +// if unescaped, would match unescaped_token. +// - INVALID_ARGUMENT +// Ex. 
escaped_string="foo b\\a\\\"r baz", unescaped_token="ba\"r" +// returns "b\\a\\\"r" +libtextclassifier3::StatusOr<std::string_view> FindEscapedToken( + std::string_view escaped_string, std::string_view unescaped_token); + +} // namespace string_util + +} // namespace lib +} // namespace icing + +#endif // ICING_QUERY_ADVANCED_QUERY_PARSER__STRING_UTIL_H_ diff --git a/icing/query/advanced_query_parser/util/string-util_test.cc b/icing/query/advanced_query_parser/util/string-util_test.cc new file mode 100644 index 0000000..a7ccf3e --- /dev/null +++ b/icing/query/advanced_query_parser/util/string-util_test.cc @@ -0,0 +1,125 @@ +// Copyright (C) 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "icing/query/advanced_query_parser/util/string-util.h" + +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include "icing/testing/common-matchers.h" + +namespace icing { +namespace lib { + +namespace { + +using ::testing::Eq; +using ::testing::IsEmpty; + +TEST(StringUtilTest, UnescapeStringEmptyString) { + EXPECT_THAT(string_util::UnescapeStringValue(""), IsOkAndHolds(IsEmpty())); +} + +TEST(StringUtilTest, UnescapeStringStringWithNoEscapes) { + EXPECT_THAT(string_util::UnescapeStringValue("foo"), IsOkAndHolds("foo")); + EXPECT_THAT(string_util::UnescapeStringValue("f o o"), IsOkAndHolds("f o o")); + EXPECT_THAT(string_util::UnescapeStringValue("f\to\to"), + IsOkAndHolds("f\to\to")); + EXPECT_THAT(string_util::UnescapeStringValue("f.o.o"), IsOkAndHolds("f.o.o")); +} + +TEST(StringUtilTest, UnescapeStringStringWithEscapes) { + EXPECT_THAT(string_util::UnescapeStringValue("f\\oo"), IsOkAndHolds("foo")); + EXPECT_THAT(string_util::UnescapeStringValue("f\\\\oo"), + IsOkAndHolds("f\\oo")); + EXPECT_THAT(string_util::UnescapeStringValue("f\\\"oo"), + IsOkAndHolds("f\"oo")); + EXPECT_THAT(string_util::UnescapeStringValue("foo\\"), IsOkAndHolds("foo")); + EXPECT_THAT(string_util::UnescapeStringValue("foo b\\a\\\"r baz"), + IsOkAndHolds("foo ba\"r baz")); + EXPECT_THAT(string_util::UnescapeStringValue("bar b\\aar bar\\s bart"), + IsOkAndHolds("bar baar bars bart")); + EXPECT_THAT(string_util::UnescapeStringValue("\\\\\\\\a"), + IsOkAndHolds("\\\\a")); +} + +TEST(StringUtilTest, UnescapeStringQuoteWithoutEscape) { + EXPECT_THAT(string_util::UnescapeStringValue("f\\o\"o"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(string_util::UnescapeStringValue("f\"oo"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST(StringUtilTest, FindEscapedTokenEmptyUnescapedToken) { + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", ""), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + 
+TEST(StringUtilTest, FindEscapedTokenTokenNotPresent) { + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "elephant"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "bat"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "taz"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "bazz"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +TEST(StringUtilTest, FindEscapedTokenMatchInMiddleToken) { + EXPECT_THAT(string_util::FindEscapedToken("babar", "bar"), + IsOkAndHolds("bar")); +} + +TEST(StringUtilTest, FindEscapedTokenMatches) { + EXPECT_THAT(string_util::FindEscapedToken("foo b\\a\\\"r baz", "ba\"r"), + IsOkAndHolds("b\\a\\\"r")); + EXPECT_THAT(string_util::FindEscapedToken("\\\\\\\\a", "\\\\a"), + IsOkAndHolds("\\\\\\\\a")); +} + +TEST(StringUtilTest, FindEscapedTokenTraversesThroughEscapedText) { + std::string_view escaped_text = "bar b\\aar bar\\s bart"; + ICING_ASSERT_OK_AND_ASSIGN( + std::string_view result, + string_util::FindEscapedToken(escaped_text, "bar")); + // escaped_text = "bar b\\aar bar\\s bart"; + // escaped_token ^ ^ + EXPECT_THAT(result, Eq("bar")); + + // escaped_text = "b\\aar bar\\s bart"; + // escaped_token ^ ^ + const char* result_end = result.data() + result.length(); + escaped_text = escaped_text.substr(result_end - escaped_text.data()); + ICING_ASSERT_OK_AND_ASSIGN( + result, string_util::FindEscapedToken(escaped_text, "bar")); + EXPECT_THAT(result, Eq("bar")); + + // escaped_text = "\\s bart"; + // escaped_token ^ ^ + result_end = result.data() + result.length(); + escaped_text = escaped_text.substr(result_end - escaped_text.data()); + ICING_ASSERT_OK_AND_ASSIGN( + result, string_util::FindEscapedToken(escaped_text, "bar")); + EXPECT_THAT(result, Eq("bar")); + + 
result_end = result.data() + result.length(); + escaped_text = escaped_text.substr(result_end - escaped_text.data()); + EXPECT_THAT(string_util::FindEscapedToken(escaped_text, "bar"), + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); +} + +} // namespace + +} // namespace lib +} // namespace icing
\ No newline at end of file diff --git a/icing/query/query-processor.cc b/icing/query/query-processor.cc index 9b03a0e..6760fad 100644 --- a/icing/query/query-processor.cc +++ b/icing/query/query-processor.cc @@ -203,8 +203,8 @@ libtextclassifier3::StatusOr<QueryResults> QueryProcessor::ParseAdvancedQuery( ranking_strategy == ScoringSpecProto::RankingStrategy::RELEVANCE_SCORE; QueryVisitor query_visitor( &index_, &numeric_index_, &document_store_, &schema_store_, &normalizer_, - plain_tokenizer.get(), std::move(options), search_spec.term_match_type(), - needs_term_frequency_info); + plain_tokenizer.get(), search_spec.query(), std::move(options), + search_spec.term_match_type(), needs_term_frequency_info); tree_root->Accept(&query_visitor); return std::move(query_visitor).ConsumeResults(); } diff --git a/icing/query/suggestion-processor_test.cc b/icing/query/suggestion-processor_test.cc index e161099..d4ecec0 100644 --- a/icing/query/suggestion-processor_test.cc +++ b/icing/query/suggestion-processor_test.cc @@ -14,6 +14,9 @@ #include "icing/query/suggestion-processor.h" +#include <string> +#include <vector> + #include "gmock/gmock.h" #include "icing/document-builder.h" #include "icing/index/numeric/dummy-numeric-index.h" @@ -36,10 +39,19 @@ namespace lib { namespace { using ::testing::IsEmpty; -using ::testing::SizeIs; using ::testing::Test; using ::testing::UnorderedElementsAre; +std::vector<std::string> RetrieveSuggestionsText( + const std::vector<TermMetadata>& terms) { + std::vector<std::string> suggestions; + suggestions.reserve(terms.size()); + for (const TermMetadata& term : terms) { + suggestions.push_back(term.content); + } + return suggestions; +} + class SuggestionProcessorTest : public Test { protected: SuggestionProcessorTest() @@ -181,8 +193,7 @@ TEST_F(SuggestionProcessorTest, MultipleTermsTest_And) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - 
EXPECT_THAT(terms.at(0).content, "bar foo"); - EXPECT_THAT(terms, SizeIs(1)); + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("bar foo")); } TEST_F(SuggestionProcessorTest, MultipleTermsTest_AndNary) { @@ -228,8 +239,8 @@ TEST_F(SuggestionProcessorTest, MultipleTermsTest_AndNary) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms.at(0).content, "bar cat foo"); - EXPECT_THAT(terms, SizeIs(1)); + EXPECT_THAT(RetrieveSuggestionsText(terms), + UnorderedElementsAre("bar cat foo")); } TEST_F(SuggestionProcessorTest, MultipleTermsTest_Or) { @@ -277,11 +288,7 @@ TEST_F(SuggestionProcessorTest, MultipleTermsTest_Or) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - std::vector<std::string> suggestions; - for (TermMetadata term : terms) { - suggestions.push_back(term.content); - } - EXPECT_THAT(suggestions, + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("bar OR cat fo", "bar OR cat foo")); } @@ -340,14 +347,11 @@ TEST_F(SuggestionProcessorTest, MultipleTermsTest_OrNary) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - std::vector<std::string> suggestions; - for (TermMetadata term : terms) { - suggestions.push_back(term.content); - } // "fo" in document1, "foo" in document2 and "fool" in document3 could match. 
- EXPECT_THAT(suggestions, UnorderedElementsAre("bar OR cat OR lot fo", - "bar OR cat OR lot foo", - "bar OR cat OR lot fool")); + EXPECT_THAT( + RetrieveSuggestionsText(terms), + UnorderedElementsAre("bar OR cat OR lot fo", "bar OR cat OR lot foo", + "bar OR cat OR lot fool")); } TEST_F(SuggestionProcessorTest, MultipleTermsTest_NormalizedTerm) { @@ -394,22 +398,17 @@ TEST_F(SuggestionProcessorTest, MultipleTermsTest_NormalizedTerm) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - std::vector<std::string> suggestions; - for (TermMetadata term : terms) { - suggestions.push_back(term.content); - } // The term is normalized. - EXPECT_THAT(suggestions, UnorderedElementsAre("bar foo", "bar fool")); - suggestions.clear(); + EXPECT_THAT(RetrieveSuggestionsText(terms), + UnorderedElementsAre("bar foo", "bar fool")); + // Search for "bar AND ḞÖ" suggestion_spec.set_prefix("bar ḞÖ"); ICING_ASSERT_OK_AND_ASSIGN( terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - for (TermMetadata term : terms) { - suggestions.push_back(term.content); - } // The term is normalized. 
- EXPECT_THAT(suggestions, UnorderedElementsAre("bar foo", "bar fool")); + EXPECT_THAT(RetrieveSuggestionsText(terms), + UnorderedElementsAre("bar foo", "bar fool")); } TEST_F(SuggestionProcessorTest, NonExistentPrefixTest) { @@ -441,7 +440,6 @@ TEST_F(SuggestionProcessorTest, NonExistentPrefixTest) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms, IsEmpty()); } @@ -474,7 +472,6 @@ TEST_F(SuggestionProcessorTest, PrefixTrailingSpaceTest) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms, IsEmpty()); } @@ -506,23 +503,22 @@ TEST_F(SuggestionProcessorTest, NormalizePrefixTest) { ICING_ASSERT_OK_AND_ASSIGN( std::vector<TermMetadata> terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms.at(0).content, "foo"); + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo")); suggestion_spec.set_prefix("fO"); ICING_ASSERT_OK_AND_ASSIGN( terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms.at(0).content, "foo"); + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo")); suggestion_spec.set_prefix("Fo"); ICING_ASSERT_OK_AND_ASSIGN( terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms.at(0).content, "foo"); + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo")); suggestion_spec.set_prefix("FO"); ICING_ASSERT_OK_AND_ASSIGN( terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - - EXPECT_THAT(terms.at(0).content, "foo"); + EXPECT_THAT(RetrieveSuggestionsText(terms), UnorderedElementsAre("foo")); } TEST_F(SuggestionProcessorTest, ParenthesesOperatorPrefixTest) { @@ -593,20 +589,34 @@ TEST_F(SuggestionProcessorTest, OtherSpecialPrefixTest) { suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( TermMatchType::PREFIX); - 
ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms, IsEmpty()); + auto terms_or = suggestion_processor_->QuerySuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or); + EXPECT_THAT(terms, IsEmpty()); + } else { + EXPECT_THAT(terms_or, + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + } + // TODO(b/208654892): Update handling for hyphens to only consider it a hyphen + // within a TEXT token (rather than a MINUS token) when surrounded on both + // sides by TEXT rather than just preceded by TEXT. suggestion_spec.set_prefix("f-"); - ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); + terms_or = suggestion_processor_->QuerySuggestions(suggestion_spec); + ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or); EXPECT_THAT(terms, IsEmpty()); suggestion_spec.set_prefix("f OR"); - ICING_ASSERT_OK_AND_ASSIGN( - terms, suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms, IsEmpty()); + terms_or = suggestion_processor_->QuerySuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or); + EXPECT_THAT(terms, IsEmpty()); + } else { + EXPECT_THAT(terms_or, + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + } } TEST_F(SuggestionProcessorTest, InvalidPrefixTest) { @@ -635,10 +645,15 @@ TEST_F(SuggestionProcessorTest, InvalidPrefixTest) { suggestion_spec.mutable_scoring_spec()->set_scoring_match_type( TermMatchType::PREFIX); - ICING_ASSERT_OK_AND_ASSIGN( - std::vector<TermMetadata> terms, - suggestion_processor_->QuerySuggestions(suggestion_spec)); - EXPECT_THAT(terms, 
IsEmpty()); + auto terms_or = suggestion_processor_->QuerySuggestions(suggestion_spec); + if (SearchSpecProto::default_instance().search_type() == + SearchSpecProto::SearchType::ICING_RAW_QUERY) { + ICING_ASSERT_OK_AND_ASSIGN(std::vector<TermMetadata> terms, terms_or); + EXPECT_THAT(terms, IsEmpty()); + } else { + EXPECT_THAT(terms_or, + StatusIs(libtextclassifier3::StatusCode::INVALID_ARGUMENT)); + } } } // namespace diff --git a/proto/icing/proto/search.proto b/proto/icing/proto/search.proto index c9e2b1d..8bdbf0c 100644 --- a/proto/icing/proto/search.proto +++ b/proto/icing/proto/search.proto @@ -85,7 +85,8 @@ message SearchSpecProto { // enable testing. // TODO(b/208654892) Remove this field once EXPERIMENTAL_ICING_ADVANCED_QUERY // is fully supported. - optional SearchType.Code search_type = 6 [default = ICING_RAW_QUERY]; + optional SearchType.Code search_type = 6 + [default = EXPERIMENTAL_ICING_ADVANCED_QUERY]; // OPTIONAL: If this field is present, join documents based on a nested // SearchSpec. diff --git a/synced_AOSP_CL_number.txt b/synced_AOSP_CL_number.txt index 232fbe0..ae780f8 100644 --- a/synced_AOSP_CL_number.txt +++ b/synced_AOSP_CL_number.txt @@ -1 +1 @@ -set(synced_AOSP_CL_number=-514555603) +set(synced_AOSP_CL_number=515353673) |