diff options
author | Tim Barron <tjbarron@google.com> | 2022-12-12 18:03:05 -0800 |
---|---|---|
committer | Tim Barron <tjbarron@google.com> | 2022-12-12 18:03:05 -0800 |
commit | 8ddc32ad433ea147de80dcfac2afe58962360f18 (patch) | |
tree | 50c30cb98396499bf1d6caf33b383f7f4bbc7e58 /icing/query/advanced_query_parser/parser.cc | |
parent | 2658f90984737e5bf6c76d82024103dccd4d51c6 (diff) | |
download | icing-8ddc32ad433ea147de80dcfac2afe58962360f18.tar.gz |
Sync from upstream.
Descriptions:
======================================================================
Add ScoringSpec into JoinSpec. Rename joined_document to child_document.
======================================================================
Create JoinedScoredDocumentHit class and refactor ScoredDocumentHitsRanker.
======================================================================
Implement initial Join workflow
======================================================================
Implement the Lexer for Icing Advanced Query Language
======================================================================
Create struct Options for PersistentHashMap
======================================================================
Premapping FileBackedVector
======================================================================
Create class PersistentHashMapKeyMapper
======================================================================
Add integer sections into TokenizedDocument and rename string sections
======================================================================
Create NumericIndex interface and DocHitInfoIteratorNumeric
======================================================================
Implement DummyNumericIndex and unit test
======================================================================
Change PostingListAccessor::Finalize to rvalue member function
======================================================================
Define the Abstract Syntax Tree for Icing's list_filter parser.
======================================================================
Refactor query processing and score
======================================================================
Refactor IcingSearchEngine for AppSearch Dynamite Module 0p APIs
======================================================================
Implement the Lexer for Icing Advanced Scoring Language
======================================================================
Add a common interface for IcingSearchEngine and dynamite client
======================================================================
Implement a subset of the query grammar.
======================================================================
Refactor index processor
======================================================================
Add integer index into IcingSearchEngine and IndexProcessor
======================================================================
Implement the parser for Icing Advanced Scoring Language
======================================================================
Implement IntegerIndexData and PostingListUsedIntegerIndexDataSerializer
======================================================================
Add PostingListAccessor abstract class for common components and methods
======================================================================
Implement PostingListIntegerIndexDataAccessor
======================================================================
Create PostingListIntegerIndexDataAccessorTest
======================================================================
Fix Icing Segmentation tests for word connectors that changed in ICU 72.
======================================================================
Modify the Advanced Query grammar to allow functions to accept expressions.
======================================================================
Implement QueryVisitor.
======================================================================
Enable the Advanced Query Parser to handle member functions
======================================================================
Refactor the Scorer class to support the Advanced Scoring Language
======================================================================
Integrate advanced query parser with the query processor.
======================================================================
Implement support for JoinSpec in Icing.
======================================================================
Implement the Advanced Scoring Language for basic functions and operators
======================================================================
Bug: 208654892
Bug: 249829533
Bug: 256022027
Bug: 261474063
Bug: 240333360
Bug: 193919210
Change-Id: I5f5bdc6249282ecc4b014b4fbdf8e2d1f8b20c19
Diffstat (limited to 'icing/query/advanced_query_parser/parser.cc')
-rw-r--r-- | icing/query/advanced_query_parser/parser.cc | 414 |
1 file changed, 414 insertions, 0 deletions
diff --git a/icing/query/advanced_query_parser/parser.cc b/icing/query/advanced_query_parser/parser.cc new file mode 100644 index 0000000..086f038 --- /dev/null +++ b/icing/query/advanced_query_parser/parser.cc @@ -0,0 +1,414 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "icing/query/advanced_query_parser/parser.h" + +#include <memory> +#include <string_view> + +#include "icing/absl_ports/canonical_errors.h" +#include "icing/legacy/core/icing-string-util.h" +#include "icing/query/advanced_query_parser/abstract-syntax-tree.h" +#include "icing/util/status-macros.h" + +namespace icing { +namespace lib { + +namespace { + +std::unique_ptr<Node> CreateNaryNode( + std::string_view operator_text, + std::vector<std::unique_ptr<Node>>&& operands) { + if (operands.empty()) { + return nullptr; + } + if (operands.size() == 1) { + return std::move(operands.at(0)); + } + return std::make_unique<NaryOperatorNode>(std::string(operator_text), + std::move(operands)); +} + +} // namespace + +libtextclassifier3::Status Parser::Consume(Lexer::TokenType token_type) { + if (!Match(token_type)) { + return absl_ports::InvalidArgumentError(IcingStringUtil::StringPrintf( + "Unable to consume token %d.", static_cast<int>(token_type))); + } + ++current_token_; + return libtextclassifier3::Status::OK; +} + +libtextclassifier3::StatusOr<std::unique_ptr<TextNode>> Parser::ConsumeText() { + if (!Match(Lexer::TokenType::TEXT)) { + return 
absl_ports::InvalidArgumentError("Unable to consume token as TEXT."); + } + auto text_node = std::make_unique<TextNode>(std::move(current_token_->text)); + ++current_token_; + return text_node; +} + +libtextclassifier3::StatusOr<std::unique_ptr<FunctionNameNode>> +Parser::ConsumeFunctionName() { + if (!Match(Lexer::TokenType::FUNCTION_NAME)) { + return absl_ports::InvalidArgumentError( + "Unable to consume token as FUNCTION_NAME."); + } + auto function_name_node = + std::make_unique<FunctionNameNode>(std::move(current_token_->text)); + ++current_token_; + return function_name_node; +} + +libtextclassifier3::StatusOr<std::unique_ptr<StringNode>> +Parser::ConsumeString() { + if (!Match(Lexer::TokenType::STRING)) { + return absl_ports::InvalidArgumentError( + "Unable to consume token as STRING."); + } + auto node = std::make_unique<StringNode>(std::move(current_token_->text)); + ++current_token_; + return node; +} + +libtextclassifier3::StatusOr<std::string> Parser::ConsumeComparator() { + if (!Match(Lexer::TokenType::COMPARATOR)) { + return absl_ports::InvalidArgumentError( + "Unable to consume token as COMPARATOR."); + } + std::string comparator = std::move(current_token_->text); + ++current_token_; + return comparator; +} + +// member +// : TEXT (DOT TEXT)* (DOT function)? +// ; +libtextclassifier3::StatusOr<std::unique_ptr<MemberNode>> +Parser::ConsumeMember() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<TextNode> text_node, ConsumeText()); + std::vector<std::unique_ptr<TextNode>> children; + children.push_back(std::move(text_node)); + + while (Match(Lexer::TokenType::DOT)) { + Consume(Lexer::TokenType::DOT); + if (MatchFunction()) { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNode> function_node, + ConsumeFunction()); + // Once a function is matched, we should exit the current rule based on + // the grammar. 
+ return std::make_unique<MemberNode>(std::move(children), + std::move(function_node)); + } + ICING_ASSIGN_OR_RETURN(text_node, ConsumeText()); + children.push_back(std::move(text_node)); + } + return std::make_unique<MemberNode>(std::move(children), + /*function=*/nullptr); +} + +// function +// : FUNCTION_NAME LPAREN argList? RPAREN +// ; +libtextclassifier3::StatusOr<std::unique_ptr<FunctionNode>> +Parser::ConsumeFunction() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<FunctionNameNode> function_name, + ConsumeFunctionName()); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN)); + + std::vector<std::unique_ptr<Node>> args; + if (Match(Lexer::TokenType::RPAREN)) { + // Got empty argument. + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN)); + } else { + ICING_ASSIGN_OR_RETURN(args, ConsumeArgs()); + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN)); + } + return std::make_unique<FunctionNode>(std::move(function_name), + std::move(args)); +} + +// comparable +// : STRING +// | member +// | function +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> +Parser::ConsumeComparable() { + if (Match(Lexer::TokenType::STRING)) { + return ConsumeString(); + } else if (MatchMember()) { + return ConsumeMember(); + } + // The current token sequence isn't a STRING or member. Therefore, it must be + // a function. 
+ return ConsumeFunction(); +} + +// composite +// : LPAREN expression RPAREN +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeComposite() { + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::LPAREN)); + + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> expression, ConsumeExpression()); + + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::RPAREN)); + return expression; +} + +// argList +// : expression (COMMA expression)* +// ; +libtextclassifier3::StatusOr<std::vector<std::unique_ptr<Node>>> +Parser::ConsumeArgs() { + std::vector<std::unique_ptr<Node>> args; + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> arg, ConsumeExpression()); + args.push_back(std::move(arg)); + while (Match(Lexer::TokenType::COMMA)) { + Consume(Lexer::TokenType::COMMA); + ICING_ASSIGN_OR_RETURN(arg, ConsumeExpression()); + args.push_back(std::move(arg)); + } + return args; +} + +// restriction +// : comparable (COMPARATOR (comparable | composite))? +// ; +// COMPARATOR will not be produced in Scoring Lexer. 
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> +Parser::ConsumeRestriction() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> comparable, ConsumeComparable()); + + if (!Match(Lexer::TokenType::COMPARATOR)) { + return comparable; + } + ICING_ASSIGN_OR_RETURN(std::string operator_text, ConsumeComparator()); + std::unique_ptr<Node> arg; + if (MatchComposite()) { + ICING_ASSIGN_OR_RETURN(arg, ConsumeComposite()); + } else if (MatchComparable()) { + ICING_ASSIGN_OR_RETURN(arg, ConsumeComparable()); + } else { + return absl_ports::InvalidArgumentError( + "ARG: must begin with LPAREN or FIRST(comparable)"); + } + std::vector<std::unique_ptr<Node>> args; + args.push_back(std::move(comparable)); + args.push_back(std::move(arg)); + return std::make_unique<NaryOperatorNode>(std::move(operator_text), + std::move(args)); +} + +// simple +// : restriction +// | composite +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSimple() { + if (MatchComposite()) { + return ConsumeComposite(); + } else if (MatchRestriction()) { + return ConsumeRestriction(); + } + return absl_ports::InvalidArgumentError( + "SIMPLE: must be a restriction or composite"); +} + +// term +// : NOT? simple +// | MINUS simple +// ; +// NOT will not be produced in Scoring Lexer. 
+libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeTerm() { + if (!Match(Lexer::TokenType::NOT) && !Match(Lexer::TokenType::MINUS)) { + return ConsumeSimple(); + } + std::string operator_text; + if (language_ == Lexer::Language::SCORING) { + ICING_RETURN_IF_ERROR(Consume(Lexer::TokenType::MINUS)); + operator_text = "MINUS"; + } else { + if (Match(Lexer::TokenType::NOT)) { + Consume(Lexer::TokenType::NOT); + } else { + Consume(Lexer::TokenType::MINUS); + } + operator_text = "NOT"; + } + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> simple, ConsumeSimple()); + return std::make_unique<UnaryOperatorNode>(operator_text, std::move(simple)); +} + +// factor +// : term (OR term)* +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeFactor() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> term, ConsumeTerm()); + std::vector<std::unique_ptr<Node>> terms; + terms.push_back(std::move(term)); + + while (Match(Lexer::TokenType::OR)) { + Consume(Lexer::TokenType::OR); + ICING_ASSIGN_OR_RETURN(term, ConsumeTerm()); + terms.push_back(std::move(term)); + } + + return CreateNaryNode("OR", std::move(terms)); +} + +// sequence +// : (factor)+ +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeSequence() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> factor, ConsumeFactor()); + std::vector<std::unique_ptr<Node>> factors; + factors.push_back(std::move(factor)); + + while (MatchFactor()) { + ICING_ASSIGN_OR_RETURN(factor, ConsumeFactor()); + factors.push_back(std::move(factor)); + } + + return CreateNaryNode("AND", std::move(factors)); +} + +// expression +// : sequence (AND sequence)* +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> +Parser::ConsumeQueryExpression() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> sequence, ConsumeSequence()); + std::vector<std::unique_ptr<Node>> sequences; + sequences.push_back(std::move(sequence)); + + while (Match(Lexer::TokenType::AND)) { + Consume(Lexer::TokenType::AND); + 
ICING_ASSIGN_OR_RETURN(sequence, ConsumeSequence()); + sequences.push_back(std::move(sequence)); + } + + return CreateNaryNode("AND", std::move(sequences)); +} + +// multExpr +// : term ((TIMES | DIV) term)* +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeMultExpr() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeTerm()); + std::vector<std::unique_ptr<Node>> stack; + stack.push_back(std::move(node)); + + while (Match(Lexer::TokenType::TIMES) || Match(Lexer::TokenType::DIV)) { + while (Match(Lexer::TokenType::TIMES)) { + Consume(Lexer::TokenType::TIMES); + ICING_ASSIGN_OR_RETURN(node, ConsumeTerm()); + stack.push_back(std::move(node)); + } + node = CreateNaryNode("TIMES", std::move(stack)); + stack.clear(); + stack.push_back(std::move(node)); + + while (Match(Lexer::TokenType::DIV)) { + Consume(Lexer::TokenType::DIV); + ICING_ASSIGN_OR_RETURN(node, ConsumeTerm()); + stack.push_back(std::move(node)); + } + node = CreateNaryNode("DIV", std::move(stack)); + stack.clear(); + stack.push_back(std::move(node)); + } + + return std::move(stack[0]); +} + +// expression +// : multExpr ((PLUS | MINUS) multExpr)* +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> +Parser::ConsumeScoringExpression() { + ICING_ASSIGN_OR_RETURN(std::unique_ptr<Node> node, ConsumeMultExpr()); + std::vector<std::unique_ptr<Node>> stack; + stack.push_back(std::move(node)); + + while (Match(Lexer::TokenType::PLUS) || Match(Lexer::TokenType::MINUS)) { + while (Match(Lexer::TokenType::PLUS)) { + Consume(Lexer::TokenType::PLUS); + ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr()); + stack.push_back(std::move(node)); + } + node = CreateNaryNode("PLUS", std::move(stack)); + stack.clear(); + stack.push_back(std::move(node)); + + while (Match(Lexer::TokenType::MINUS)) { + Consume(Lexer::TokenType::MINUS); + ICING_ASSIGN_OR_RETURN(node, ConsumeMultExpr()); + stack.push_back(std::move(node)); + } + node = CreateNaryNode("MINUS", std::move(stack)); + stack.clear(); + 
stack.push_back(std::move(node)); + } + + return std::move(stack[0]); +} + +libtextclassifier3::StatusOr<std::unique_ptr<Node>> +Parser::ConsumeExpression() { + switch (language_) { + case Lexer::Language::QUERY: + return ConsumeQueryExpression(); + case Lexer::Language::SCORING: + return ConsumeScoringExpression(); + } +} + +// query +// : expression? EOF +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeQuery() { + language_ = Lexer::Language::QUERY; + std::unique_ptr<Node> node; + if (current_token_ != lexer_tokens_.end()) { + ICING_ASSIGN_OR_RETURN(node, ConsumeExpression()); + } + if (current_token_ != lexer_tokens_.end()) { + return absl_ports::InvalidArgumentError( + "Error parsing Query. Must reach EOF after parsing Expression!"); + } + return node; +} + +// scoring +// : expression EOF +// ; +libtextclassifier3::StatusOr<std::unique_ptr<Node>> Parser::ConsumeScoring() { + language_ = Lexer::Language::SCORING; + std::unique_ptr<Node> node; + if (current_token_ == lexer_tokens_.end()) { + return absl_ports::InvalidArgumentError("Got empty scoring expression!"); + } + ICING_ASSIGN_OR_RETURN(node, ConsumeExpression()); + if (current_token_ != lexer_tokens_.end()) { + return absl_ports::InvalidArgumentError( + "Error parsing the scoring expression. Must reach EOF after parsing " + "Expression!"); + } + return node; +} + +} // namespace lib +} // namespace icing |