diff options
Diffstat (limited to 'icing/query/advanced_query_parser/lexer.h')
-rw-r--r-- | icing/query/advanced_query_parser/lexer.h | 153 |
1 files changed, 153 insertions, 0 deletions
diff --git a/icing/query/advanced_query_parser/lexer.h b/icing/query/advanced_query_parser/lexer.h new file mode 100644 index 0000000..f72affb --- /dev/null +++ b/icing/query/advanced_query_parser/lexer.h @@ -0,0 +1,153 @@ +// Copyright (C) 2022 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_ +#define ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_ + +#include <cstdint> +#include <string> +#include <string_view> +#include <vector> + +#include "icing/text_classifier/lib3/utils/base/statusor.h" + +namespace icing { +namespace lib { + +class Lexer { + public: + enum class Language { QUERY, SCORING }; + + enum class TokenType { + COMMA, // ',' + DOT, // '.' + PLUS, // '+' Not allowed in QUERY language. + MINUS, // '-' + TIMES, // '*' Not allowed in QUERY language. + DIV, // '/' Not allowed in QUERY language. + LPAREN, // '(' + RPAREN, // ')' + COMPARATOR, // '<=' | '<' | '>=' | '>' | '!=' | '==' | ':' + // Not allowed in SCORING language. + AND, // 'AND' | '&&' Not allowed in SCORING language. + OR, // 'OR' | '||' Not allowed in SCORING language. + NOT, // 'NOT' Not allowed in SCORING language. + STRING, // String literal surrounded by quotation marks + TEXT, // A sequence of chars that are not any above-listed operator + FUNCTION_NAME, // A TEXT followed by LPAREN. + // Whitespaces not inside a string literal will be skipped. + // WS: " " | "\t" | "\n" | "\r" | "\f" -> skip ; + }; + + struct LexerToken { + // For STRING, text will contain the raw original text of the token + // in between quotation marks, without unescaping. + // + // For TEXT, text will contain the text of the token after unescaping all + // escaped characters. + // + // For FUNCTION_NAME, this field will contain the name of the function. + // + // For COMPARATOR, this field will contain the comparator. + // + // For other types, this field will be empty. + std::string text; + + // The type of the token. + TokenType type; + }; + + explicit Lexer(std::string_view query, Language language) + : query_(query), language_(language) { + Advance(); + } + + // Get a vector of LexerToken after lexing the query given in the constructor. + // + // Returns: + // A vector of LexerToken on success + // INVALID_ARGUMENT on syntax error. + libtextclassifier3::StatusOr<std::vector<LexerToken>> ExtractTokens(); + + private: + // Advance to current_index_ + n. + void Advance(uint32_t n = 1) { + if (current_index_ + n >= query_.size()) { + current_index_ = query_.size(); + current_char_ = '\0'; + } else { + current_index_ += n; + current_char_ = query_[current_index_]; + } + } + + // Get the character at current_index_ + n. + char PeekNext(uint32_t n = 1) { + if (current_index_ + n >= query_.size()) { + return '\0'; + } else { + return query_[current_index_ + n]; + } + } + + void SyntaxError(std::string error) { + current_index_ = query_.size(); + current_char_ = '\0'; + error_ = std::move(error); + } + + // Try to match a whitespace token and skip it. + bool ConsumeWhitespace(); + + // Try to match a single-char token other than '<' and '>'. + bool ConsumeSingleChar(); + bool ConsumeQuerySingleChar(); + bool ConsumeScoringSingleChar(); + bool ConsumeGeneralSingleChar(); + + // Try to match a comparator token other than ':'. + bool ConsumeComparator(); + + // Try to match '&&' and '||'. + // 'AND' and 'OR' will be handled in Text() instead, so that 'ANDfoo' and + // 'fooOR' is a TEXT, instead of an 'AND' or 'OR'. + bool ConsumeAndOr(); + + // Try to match a string literal. + bool ConsumeStringLiteral(); + + // Try to match a non-text. + bool ConsumeNonText() { + return ConsumeWhitespace() || ConsumeSingleChar() || + (language_ == Language::QUERY && ConsumeComparator()) || + (language_ == Language::QUERY && ConsumeAndOr()) || + ConsumeStringLiteral(); + } + + // Try to match TEXT, FUNCTION_NAME, 'AND', 'OR' and 'NOT'. + // Should make sure that NonText() is false before calling into this method. + bool Text(); + + std::string_view query_; + std::string error_; + Language language_; + int32_t current_index_ = -1; + char current_char_ = '\0'; + std::vector<LexerToken> tokens_; +}; + +} // namespace lib +} // namespace icing + +#endif // ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_ |