aboutsummaryrefslogtreecommitdiff
path: root/icing/query/advanced_query_parser/lexer.h
diff options
context:
space:
mode:
Diffstat (limited to 'icing/query/advanced_query_parser/lexer.h')
-rw-r--r--icing/query/advanced_query_parser/lexer.h153
1 files changed, 153 insertions, 0 deletions
diff --git a/icing/query/advanced_query_parser/lexer.h b/icing/query/advanced_query_parser/lexer.h
new file mode 100644
index 0000000..f72affb
--- /dev/null
+++ b/icing/query/advanced_query_parser/lexer.h
@@ -0,0 +1,153 @@
+// Copyright (C) 2022 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_
+#define ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_
+
+#include <cstdint>
+#include <string>
+#include <string_view>
+#include <utility>
+#include <vector>
+
+#include "icing/text_classifier/lib3/utils/base/statusor.h"
+
+namespace icing {
+namespace lib {
+
+// Splits a query string into a flat sequence of LexerTokens.
+//
+// The same lexer serves two grammars, selected by Language; the TokenType
+// comments below note which tokens each grammar does not allow.
+class Lexer {
+ public:
+  enum class Language { QUERY, SCORING };
+
+  enum class TokenType {
+    COMMA,       // ','
+    DOT,         // '.'
+    PLUS,        // '+'            Not allowed in QUERY language.
+    MINUS,       // '-'
+    TIMES,       // '*'            Not allowed in QUERY language.
+    DIV,         // '/'            Not allowed in QUERY language.
+    LPAREN,      // '('
+    RPAREN,      // ')'
+    COMPARATOR,  // '<=' | '<' | '>=' | '>' | '!=' | '==' | ':'
+                 //                Not allowed in SCORING language.
+    AND,         // 'AND' | '&&'   Not allowed in SCORING language.
+    OR,          // 'OR' | '||'    Not allowed in SCORING language.
+    NOT,         // 'NOT'          Not allowed in SCORING language.
+    STRING,      // String literal surrounded by quotation marks
+    TEXT,        // A sequence of chars that are not any above-listed operator
+    FUNCTION_NAME,  // A TEXT followed by LPAREN.
+    // Whitespaces not inside a string literal will be skipped.
+    // WS: " " | "\t" | "\n" | "\r" | "\f" -> skip ;
+  };
+
+  struct LexerToken {
+    // For STRING, text will contain the raw original text of the token
+    // in between quotation marks, without unescaping.
+    //
+    // For TEXT, text will contain the text of the token after unescaping all
+    // escaped characters.
+    //
+    // For FUNCTION_NAME, this field will contain the name of the function.
+    //
+    // For COMPARATOR, this field will contain the comparator.
+    //
+    // For other types, this field will be empty.
+    std::string text;
+
+    // The type of the token.
+    TokenType type;
+  };
+
+  // Creates a lexer over `query` for the given `language` and positions it on
+  // the first character (or at end-of-input if `query` is empty).
+  //
+  // Only a view of `query` is stored, so the underlying character data must
+  // outlive this Lexer.
+  explicit Lexer(std::string_view query, Language language)
+      : query_(query), language_(language) {
+    Advance();
+  }
+
+  // Get a vector of LexerToken after lexing the query given in the constructor.
+  //
+  // Returns:
+  //   A vector of LexerToken on success
+  //   INVALID_ARGUMENT on syntax error.
+  libtextclassifier3::StatusOr<std::vector<LexerToken>> ExtractTokens();
+
+ private:
+  // Returns the absolute index that lies n characters past current_index_, or
+  // -1 if that index falls outside query_.
+  //
+  // The sum is formed in 64 bits on purpose: current_index_ starts at -1 and n
+  // is unsigned, so a 32-bit sum would be evaluated in unsigned arithmetic and
+  // could wrap around to a bogus in-range index for large n.
+  int64_t OffsetIndex(uint32_t n) const {
+    const int64_t target = static_cast<int64_t>(current_index_) + n;
+    const bool in_range =
+        target >= 0 && target < static_cast<int64_t>(query_.size());
+    return in_range ? target : -1;
+  }
+
+  // Puts the lexer into its end-of-input state: current_index_ is one past the
+  // last character and current_char_ is the '\0' sentinel.
+  void MoveToEnd() {
+    // current_index_ is 32-bit, so the query length is narrowed here.
+    current_index_ = static_cast<int32_t>(query_.size());
+    current_char_ = '\0';
+  }
+
+  // Advance to current_index_ + n. If that position is past the last
+  // character, the lexer moves to end-of-input and current_char_ becomes '\0'.
+  void Advance(uint32_t n = 1) {
+    const int64_t target = OffsetIndex(n);
+    if (target < 0) {
+      MoveToEnd();
+    } else {
+      current_index_ = static_cast<int32_t>(target);
+      current_char_ = query_[current_index_];
+    }
+  }
+
+  // Get the character at current_index_ + n without moving the lexer.
+  // Returns '\0' if that position is outside the query.
+  char PeekNext(uint32_t n = 1) const {
+    const int64_t target = OffsetIndex(n);
+    return target < 0 ? '\0' : query_[target];
+  }
+
+  // Records `error` in error_ and jumps to end-of-input.
+  void SyntaxError(std::string error) {
+    MoveToEnd();
+    error_ = std::move(error);
+  }
+
+  // Try to match a whitespace token and skip it.
+  bool ConsumeWhitespace();
+
+  // Try to match a single-char token other than '<' and '>'.
+  bool ConsumeSingleChar();
+  bool ConsumeQuerySingleChar();
+  bool ConsumeScoringSingleChar();
+  bool ConsumeGeneralSingleChar();
+
+  // Try to match a comparator token other than ':'.
+  bool ConsumeComparator();
+
+  // Try to match '&&' and '||'.
+  // 'AND' and 'OR' will be handled in Text() instead, so that 'ANDfoo' and
+  // 'fooOR' are lexed as TEXT, instead of as an 'AND' or 'OR'.
+  bool ConsumeAndOr();
+
+  // Try to match a string literal.
+  bool ConsumeStringLiteral();
+
+  // Try to match a non-text.
+  //
+  // The matchers are tried in the order written and evaluation stops at the
+  // first one that returns true. Comparators and '&&' / '||' are only
+  // attempted for the QUERY language.
+  bool ConsumeNonText() {
+    return ConsumeWhitespace() || ConsumeSingleChar() ||
+           (language_ == Language::QUERY && ConsumeComparator()) ||
+           (language_ == Language::QUERY && ConsumeAndOr()) ||
+           ConsumeStringLiteral();
+  }
+
+  // Try to match TEXT, FUNCTION_NAME, 'AND', 'OR' and 'NOT'.
+  // Should make sure that ConsumeNonText() is false before calling into this
+  // method.
+  bool Text();
+
+  // Non-owning view of the text being lexed.
+  std::string_view query_;
+  // Message recorded by the most recent SyntaxError() call.
+  std::string error_;
+  Language language_;
+  // Index of current_char_ within query_. Starts at -1 so that the Advance()
+  // call in the constructor lands on index 0; equals query_.size() at
+  // end-of-input.
+  int32_t current_index_ = -1;
+  // Character at current_index_, or '\0' at end-of-input.
+  char current_char_ = '\0';
+  std::vector<LexerToken> tokens_;
+};
+
+} // namespace lib
+} // namespace icing
+
+#endif // ICING_QUERY_ADVANCED_QUERY_PARSER_LEXER_H_