diff options
author | Tim Barron <tjbarron@google.com> | 2022-12-12 18:03:05 -0800 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2022-12-12 18:03:05 -0800 |
commit | 8ddc32ad433ea147de80dcfac2afe58962360f18 (patch) | |
tree | 50c30cb98396499bf1d6caf33b383f7f4bbc7e58 /icing/query/advanced_query_parser/parser.cc | |
parent | 2658f90984737e5bf6c76d82024103dccd4d51c6 (diff) | |
download | icing-8ddc32ad433ea147de80dcfac2afe58962360f18.tar.gz |
Sync from upstream.
Descriptions:
======================================================================
Add ScoringSpec into JoinSpec. Rename joined_document to child_document.
======================================================================
Create JoinedScoredDocumentHit class and refactor ScoredDocumentHitsRanker.
======================================================================
Implement initial Join workflow
======================================================================
Implement the Lexer for Icing Advanced Query Language
======================================================================
Create struct Options for PersistentHashMap
======================================================================
Premapping FileBackedVector
======================================================================
Create class PersistentHashMapKeyMapper
======================================================================
Add integer sections into TokenizedDocument and rename string sections
======================================================================
Create NumericIndex interface and DocHitInfoIteratorNumeric
======================================================================
Implement DummyNumericIndex and unit test
======================================================================
Change PostingListAccessor::Finalize to rvalue member function
======================================================================
Define the Abstract Syntax Tree for Icing's list_filter parser.
======================================================================
Refactor query processing and score
======================================================================
Refactor IcingSearchEngine for AppSearch Dynamite Module 0p APIs
======================================================================
Implement the Lexer for Icing Advanced Scoring Language
======================================================================
Add a common interface for IcingSearchEngine and dynamite client
======================================================================
Implement a subset of the query grammar.
======================================================================
Refactor index processor
======================================================================
Add integer index into IcingSearchEngine and IndexProcessor
======================================================================
Implement the parser for Icing Advanced Scoring Language
======================================================================
Implement IntegerIndexData and PostingListUsedIntegerIndexDataSerializer
======================================================================
Add PostingListAccessor abstract class for common components and methods
======================================================================
Implement PostingListIntegerIndexDataAccessor
======================================================================
Create PostingListIntegerIndexDataAccessorTest
======================================================================
Fix Icing Segmentation tests for word connectors that changed in ICU 72.
======================================================================
Modify the Advanced Query grammar to allow functions to accept expressions.
======================================================================
Implement QueryVisitor.
======================================================================
Enable the Advanced Query Parser to handle member functions
======================================================================
Refactor the Scorer class to support the Advanced Scoring Language
======================================================================
Integrate advanced query parser with the query processor.
======================================================================
Implement support for JoinSpec in Icing.
======================================================================
Implement the Advanced Scoring Language for basic functions and operators
======================================================================
Bug: 208654892
Bug: 249829533
Bug: 256022027
Bug: 261474063
Bug: 240333360
Bug: 193919210
Change-Id: I5f5bdc6249282ecc4b014b4fbdf8e2d1f8b20c19
Diffstat (limited to 'icing/query/advanced_query_parser/parser.cc')
-rw-r--r-- | icing/query/advanced_query_parser/parser.cc | 414 |
1 file changed, 414 insertions, 0 deletions
diff --git a/icing/query/advanced_query_parser/parser.cc b/icing/query/advanced_query_parser/parser.cc new file mode 100644 index 0000000..086f038 --- /dev/null +++ b/icing/query/advanced_query_parser/parser.cc @@ -0,0 +1,414 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/query/advanced_query_parser/parser.h" + +#include <memory> +#include <string_view> + +#include "icing/absl_ports/canonical_errors.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/query/advanced_query_parser/abstract-syntax-tree.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +std::unique_ptr<Node> CreateNaryNode( + std::string_view operator_text, + std::vector<std::unique_ptr<Node>>&& operands) { + if (operands.empty()) { + return nullptr; + } + if (operands.size() == 1) { + return std::move(operands.at(0)); + } + return std::make_unique<NaryOperatorNode>(std::string(operator_text), + std::move(operands)); +} + +} // namespace + +libtextclassifier3::Status Parser::Consume(Lexer::TokenType token_type) { + if (!Match(token_type)) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Unable to consume token %d.", static_cast<int>(token_type))); + } + ++current_token_; + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::StatusOr<std::unique_ptr<TextNode>> Parser::ConsumeText() { + if (!Match(Lexer::TokenType::TEXT)) { + return 
absl_ports::InvalidArgumentError("Unable to consume token as TEXT."); + } + auto text_node = std::make_unique<TextNode>(std::move(current_token_->text)); + ++current_token_; + return text_node; +} + +libtextclassifier3::StatusOr<std::unique_ptr<FunctionNameNode>> +Parser::ConsumeFunctionName() { + if (!Match(Lexer::TokenType::FUNCTION_NAME)) { + return absl_ports::InvalidArgumentError( + "Unable to consume token as FUNCTION_NAME."); + } + auto function_name_node = + std::make_unique<FunctionNameNode>(std::move(current_token_->text)); + ++current_token_; + return function_name_node; +} + +libtextclassifier3::StatusOr<std::unique_ptr<StringNode>> +Parser::ConsumeString() { + if (!Match(Lexer::TokenType::STRING)) { + return absl_ports::InvalidArgumentError( + "Unable to consume token as STRING."); + } + auto node = std::make_unique<StringNode>(std::move(current_token_->text)); + ++current_token_; + return node; +} + +libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() { + if (!Match(Lexer::TokenType::COMPARATOR)) { + return absl_ports::InvalidArgumentError( + "Unable to consume token as COMPARATOR."); + } + std::string comparator = std::move(current_token_->text); + ++current_token_; + return comparator; +} + +// member +// : TEXT (DOT TEXT)* (DOT function)? +// ; +libtextclassifier3::StatusOr<std::unique_ptr<MemberNode>> +Parser::ConsumeMember() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<TextNode> text_node, ConsumeText()); + std::vector<std::unique_ptr<TextNode>> children; + children.push_back(std::move(text_node)); + + while (Match(Lexer::TokenType::DOT)) { + Consume(Lexer::TokenType::DOT); + if (MatchFunction()) { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node, + ConsumeFunction()); + // Once a function is matched, we should exit the current rule based on + // the grammar. 
+ return std::make_unique<MemberNode>(std::move(children), + std::move(function_node)); + } + ICING_ASSIGN_OR_RETURN(text_node, ConsumeText()); + children.push_back(std::move(text_node)); + } + return std::make_unique<MemberNode>(std::move(children), + /*function=*/nullptr); +} + +// function +// : FUNCTION_NAME LPAREN argList? RPAREN +// ; +libtextclassifier3::StatusOr<std::unique_ptr<FunctionNode>> +Parser::ConsumeFunction() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNameNode> function_name, + ConsumeFunctionName()); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN)); + + std::vector<std::unique_ptr<Node>> args; + if (Match(Lexer::TokenType::RPAREN)) { + // Got empty argument. + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN)); + } else { + ICING_ASSIGN_OR_RETURN(args, ConsumeArgs()); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN)); + } + return std::make_unique<FunctionNode>(std::move(function_name), + std::move(args)); +} + +// comparable +// : STRING +// | member +// | function +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> +Parser::ConsumeComparable() { + if (Match(Lexer::TokenType::STRING)) { + return ConsumeString(); + } else if (MatchMember()) { + return ConsumeMember(); + } + // The current token sequence isn't a STRING or member. Therefore, it must be + // a function. 
+ return ConsumeFunction(); +} + +// composite +// : LPAREN expression RPAREN +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeComposite() { + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN)); + + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> expression, ConsumeExpression()); + + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN)); + return expression; +} + +// argList +// : expression (COMMA expression)* +// ; +libtextclassifier3::StatusOr<std::vector<std::unique_ptr<Node>>> +Parser::ConsumeArgs() { + std::vector<std::unique_ptr<Node>> args; + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> arg, ConsumeExpression()); + args.push_back(std::move(arg)); + while (Match(Lexer::TokenType::COMMA)) { + Consume(Lexer::TokenType::COMMA); + ICING_ASSIGN_OR_RETURN(arg, ConsumeExpression()); + args.push_back(std::move(arg)); + } + return args; +} + +// restriction +// : comparable (COMPARATOR (comparable | composite))? +// ; +// COMPARATOR will not be produced in Scoring Lexer. 
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> +Parser::ConsumeRestriction() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> comparable, ConsumeComparable()); + + if (!Match(Lexer::TokenType::COMPARATOR)) { + return comparable; + } + ICING_ASSIGN_OR_RETURN(std::string operator_text, ConsumeComparator()); + std::unique_ptr<Node> arg; + if (MatchComposite()) { + ICING_ASSIGN_OR_RETURN(arg, ConsumeComposite()); + } else if (MatchComparable()) { + ICING_ASSIGN_OR_RETURN(arg, ConsumeComparable()); + } else { + return absl_ports::InvalidArgumentError( + "ARG: must begin with LPAREN or FIRST(comparable)"); + } + std::vector<std::unique_ptr<Node>> args; + args.push_back(std::move(comparable)); + args.push_back(std::move(arg)); + return std::make_unique<NaryOperatorNode>(std::move(operator_text), + std::move(args)); +} + +// simple +// : restriction +// | composite +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSimple() { + if (MatchComposite()) { + return ConsumeComposite(); + } else if (MatchRestriction()) { + return ConsumeRestriction(); + } + return absl_ports::InvalidArgumentError( + "SIMPLE: must be a restriction or composite"); +} + +// term +// : NOT? simple +// | MINUS simple +// ; +// NOT will not be produced in Scoring Lexer. 
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeTerm() { + if (!Match(Lexer::TokenType::NOT) && !Match(Lexer::TokenType::MINUS)) { + return ConsumeSimple(); + } + std::string operator_text; + if (language_ == Lexer::Language::SCORING) { + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS)); + operator_text = "MINUS"; + } else { + if (Match(Lexer::TokenType::NOT)) { + Consume(Lexer::TokenType::NOT); + } else { + Consume(Lexer::TokenType::MINUS); + } + operator_text = "NOT"; + } + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> simple, ConsumeSimple()); + return std::make_unique<UnaryOperatorNode>(operator_text, std::move(simple)); +} + +// factor +// : term (OR term)* +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeFactor() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> term, ConsumeTerm()); + std::vector<std::unique_ptr<Node>> terms; + terms.push_back(std::move(term)); + + while (Match(Lexer::TokenType::OR)) { + Consume(Lexer::TokenType::OR); + ICING_ASSIGN_OR_RETURN(term, ConsumeTerm()); + terms.push_back(std::move(term)); + } + + return CreateNaryNode("OR", std::move(terms)); +} + +// sequence +// : (factor)+ +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSequence() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> factor, ConsumeFactor()); + std::vector<std::unique_ptr<Node>> factors; + factors.push_back(std::move(factor)); + + while (MatchFactor()) { + ICING_ASSIGN_OR_RETURN(factor, ConsumeFactor()); + factors.push_back(std::move(factor)); + } + + return CreateNaryNode("AND", std::move(factors)); +} + +// expression +// : sequence (AND sequence)* +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> +Parser::ConsumeQueryExpression() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> sequence, ConsumeSequence()); + std::vector<std::unique_ptr<Node>> sequences; + sequences.push_back(std::move(sequence)); + + while (Match(Lexer::TokenType::AND)) { + Consume(Lexer::TokenType::AND); + 
ICING_ASSIGN_OR_RETURN(sequence, ConsumeSequence()); + sequences.push_back(std::move(sequence)); + } + + return CreateNaryNode("AND", std::move(sequences)); +} + +// multExpr +// : term ((TIMES | DIV) term)* +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeMultExpr() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeTerm()); + std::vector<std::unique_ptr<Node>> stack; + stack.push_back(std::move(node)); + + while (Match(Lexer::TokenType::TIMES) || Match(Lexer::TokenType::DIV)) { + while (Match(Lexer::TokenType::TIMES)) { + Consume(Lexer::TokenType::TIMES); + ICING_ASSIGN_OR_RETURN(node, ConsumeTerm()); + stack.push_back(std::move(node)); + } + node = CreateNaryNode("TIMES", std::move(stack)); + stack.clear(); + stack.push_back(std::move(node)); + + while (Match(Lexer::TokenType::DIV)) { + Consume(Lexer::TokenType::DIV); + ICING_ASSIGN_OR_RETURN(node, ConsumeTerm()); + stack.push_back(std::move(node)); + } + node = CreateNaryNode("DIV", std::move(stack)); + stack.clear(); + stack.push_back(std::move(node)); + } + + return std::move(stack[0]); +} + +// expression +// : multExpr ((PLUS | MINUS) multExpr)* +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> +Parser::ConsumeScoringExpression() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeMultExpr()); + std::vector<std::unique_ptr<Node>> stack; + stack.push_back(std::move(node)); + + while (Match(Lexer::TokenType::PLUS) || Match(Lexer::TokenType::MINUS)) { + while (Match(Lexer::TokenType::PLUS)) { + Consume(Lexer::TokenType::PLUS); + ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr()); + stack.push_back(std::move(node)); + } + node = CreateNaryNode("PLUS", std::move(stack)); + stack.clear(); + stack.push_back(std::move(node)); + + while (Match(Lexer::TokenType::MINUS)) { + Consume(Lexer::TokenType::MINUS); + ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr()); + stack.push_back(std::move(node)); + } + node = CreateNaryNode("MINUS", std::move(stack)); + stack.clear(); + 
stack.push_back(std::move(node)); + } + + return std::move(stack[0]); +} + +libtextclassifier3::StatusOr<std::unique_ptr<Node>> +Parser::ConsumeExpression() { + switch (language_) { + case Lexer::Language::QUERY: + return ConsumeQueryExpression(); + case Lexer::Language::SCORING: + return ConsumeScoringExpression(); + } +} + +// query +// : expression? EOF +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeQuery() { + language_ = Lexer::Language::QUERY; + std::unique_ptr<Node> node; + if (current_token_ != lexer_tokens_.end()) { + ICING_ASSIGN_OR_RETURN(node, ConsumeExpression()); + } + if (current_token_ != lexer_tokens_.end()) { + return absl_ports::InvalidArgumentError( + "Error parsing Query. Must reach EOF after parsing Expression!"); + } + return node; +} + +// scoring +// : expression EOF +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeScoring() { + language_ = Lexer::Language::SCORING; + std::unique_ptr<Node> node; + if (current_token_ == lexer_tokens_.end()) { + return absl_ports::InvalidArgumentError("Got empty scoring expression!"); + } + ICING_ASSIGN_OR_RETURN(node, ConsumeExpression()); + if (current_token_ != lexer_tokens_.end()) { + return absl_ports::InvalidArgumentError( + "Error parsing the scoring expression. Must reach EOF after parsing " + "Expression!"); + } + return node; +} + +} // namespace lib +} // namespace icing |